SIMemoryLegalizer.cpp
1//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Memory legalizer - implements memory model. More information can be
11/// found here:
12/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13//
14//===----------------------------------------------------------------------===//
15
16#include "AMDGPU.h"
18#include "GCNSubtarget.h"
26
27using namespace llvm;
28using namespace llvm::AMDGPU;
29
30#define DEBUG_TYPE "si-memory-legalizer"
31#define PASS_NAME "SI Memory Legalizer"
32
34 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
35 cl::desc("Use this to skip inserting cache invalidating instructions."));
36
37namespace {
38
39LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
40
41/// Memory operation flags. Can be ORed together.
42enum class SIMemOp {
43 NONE = 0u,
44 LOAD = 1u << 0,
45 STORE = 1u << 1,
46 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
47};
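// Usage sketch (illustrative, not part of the pass itself): because of
// LLVM_MARK_AS_BITMASK_ENUM, these flags compose with the ordinary bitwise
// operators, which is how callers request "loads and/or stores", e.g.:
//
//   SIMemOp Op = SIMemOp::LOAD | SIMemOp::STORE;
//   if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) {
//     // ... the operation covers loads ...
//   }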
48
49/// Position to insert a new instruction relative to an existing
50/// instruction.
51enum class Position {
52 BEFORE,
53 AFTER
54};
55
56/// The atomic synchronization scopes supported by the AMDGPU target.
57enum class SIAtomicScope {
58 NONE,
59 SINGLETHREAD,
60 WAVEFRONT,
61 WORKGROUP,
62 AGENT,
63 SYSTEM
64};
65
66/// The distinct address spaces supported by the AMDGPU target for
67/// atomic memory operation. Can be ORed together.
68enum class SIAtomicAddrSpace {
69 NONE = 0u,
70 GLOBAL = 1u << 0,
71 LDS = 1u << 1,
72 SCRATCH = 1u << 2,
73 GDS = 1u << 3,
74 OTHER = 1u << 4,
75
76 /// The address spaces that can be accessed by a FLAT instruction.
77 FLAT = GLOBAL | LDS | SCRATCH,
78
79 /// The address spaces that support atomic instructions.
80 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
81
82 /// All address spaces.
83 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
84
85 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
86};
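// Usage sketch (illustrative): membership tests against this bitmask follow
// the pattern used throughout this file, e.g. to check whether an operation
// touches global memory:
//
//   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
//     // ... global memory is involved, caches may need to be bypassed ...
//   }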
87
88class SIMemOpInfo final {
89private:
90
91 friend class SIMemOpAccess;
92
93 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
94 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
95 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
96 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
97 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
98 bool IsCrossAddressSpaceOrdering = false;
99 bool IsVolatile = false;
100 bool IsNonTemporal = false;
101 bool IsLastUse = false;
102
103 SIMemOpInfo(
104 AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
105 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
106 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
107 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
108 bool IsCrossAddressSpaceOrdering = true,
109 AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
110 bool IsVolatile = false, bool IsNonTemporal = false,
111 bool IsLastUse = false)
112 : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
113 OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
114 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115 IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
116 IsLastUse(IsLastUse) {
117
118 if (Ordering == AtomicOrdering::NotAtomic) {
119 assert(Scope == SIAtomicScope::NONE &&
120 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
121 !IsCrossAddressSpaceOrdering &&
122 FailureOrdering == AtomicOrdering::NotAtomic);
123 return;
124 }
125
126 assert(Scope != SIAtomicScope::NONE &&
127 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
128 SIAtomicAddrSpace::NONE &&
129 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130 SIAtomicAddrSpace::NONE);
131
132 // There is also no cross address space ordering if the ordering
133 // address space is the same as the instruction address space and
134 // only contains a single address space.
135 if ((OrderingAddrSpace == InstrAddrSpace) &&
136 isPowerOf2_32(uint32_t(InstrAddrSpace)))
137 this->IsCrossAddressSpaceOrdering = false;
138
139 // Limit the scope to the maximum supported by the instruction's address
140 // spaces.
141 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142 SIAtomicAddrSpace::NONE) {
143 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144 } else if ((InstrAddrSpace &
145 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146 SIAtomicAddrSpace::NONE) {
147 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148 } else if ((InstrAddrSpace &
149 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152 }
153 }
154
155public:
156 /// \returns Atomic synchronization scope of the machine instruction used to
157 /// create this SIMemOpInfo.
158 SIAtomicScope getScope() const {
159 return Scope;
160 }
161
162 /// \returns Ordering constraint of the machine instruction used to
163 /// create this SIMemOpInfo.
164 AtomicOrdering getOrdering() const {
165 return Ordering;
166 }
167
168 /// \returns Failure ordering constraint of the machine instruction used to
169 /// create this SIMemOpInfo.
170 AtomicOrdering getFailureOrdering() const {
171 return FailureOrdering;
172 }
173
174 /// \returns The address spaces accessed by the machine
175 /// instruction used to create this SIMemOpInfo.
176 SIAtomicAddrSpace getInstrAddrSpace() const {
177 return InstrAddrSpace;
178 }
179
180 /// \returns The address spaces that must be ordered by the machine
181 /// instruction used to create this SIMemOpInfo.
182 SIAtomicAddrSpace getOrderingAddrSpace() const {
183 return OrderingAddrSpace;
184 }
185
186 /// \returns True iff memory ordering of operations on
187 /// different address spaces is required.
188 bool getIsCrossAddressSpaceOrdering() const {
189 return IsCrossAddressSpaceOrdering;
190 }
191
192 /// \returns True if memory access of the machine instruction used to
193 /// create this SIMemOpInfo is volatile, false otherwise.
194 bool isVolatile() const {
195 return IsVolatile;
196 }
197
198 /// \returns True if memory access of the machine instruction used to
199 /// create this SIMemOpInfo is nontemporal, false otherwise.
200 bool isNonTemporal() const {
201 return IsNonTemporal;
202 }
203
204 /// \returns True if memory access of the machine instruction used to
205 /// create this SIMemOpInfo is last use, false otherwise.
206 bool isLastUse() const { return IsLastUse; }
207
208 /// \returns True if ordering constraint of the machine instruction used to
209 /// create this SIMemOpInfo is unordered or higher, false otherwise.
210 bool isAtomic() const {
211 return Ordering != AtomicOrdering::NotAtomic;
212 }
213
214};
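// Illustrative note: the defaulted constructor arguments make a bare
// SIMemOpInfo() the most conservative description possible -- a sequentially
// consistent, system-scope operation ordering all atomic address spaces --
// which is what SIMemOpAccess below falls back to when an instruction carries
// no memory operands. A hypothetical seq_cst agent-scope global atomic would
// instead be summarized roughly as:
//
//   SIMemOpInfo(AtomicOrdering::SequentiallyConsistent, SIAtomicScope::AGENT,
//               SIAtomicAddrSpace::GLOBAL, SIAtomicAddrSpace::GLOBAL,
//               /*IsCrossAddressSpaceOrdering=*/false);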
215
216class SIMemOpAccess final {
217private:
218 AMDGPUMachineModuleInfo *MMI = nullptr;
219
220 /// Reports unsupported message \p Msg for \p MI to LLVM context.
221 void reportUnsupported(const MachineBasicBlock::iterator &MI,
222 const char *Msg) const;
223
224 /// Inspects the target synchronization scope \p SSID and determines
225 /// the SI atomic scope it corresponds to, the address spaces it
226 /// covers, and whether the memory ordering applies between address
227 /// spaces.
228 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
229 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
230
231 /// \returns The SIAtomicAddrSpace bit set corresponding to target address
232 /// space \p AS.
232 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
233
234 /// \returns Info constructed from \p MI, which has at least one machine
235 /// memory operand.
236 std::optional<SIMemOpInfo>
237 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
238
239public:
240 /// Construct class to support accessing the machine memory operands
241 /// of instructions in the machine function \p MF.
242 SIMemOpAccess(MachineFunction &MF);
243
244 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
245 std::optional<SIMemOpInfo>
246 getLoadInfo(const MachineBasicBlock::iterator &MI) const;
247
248 /// \returns Store info if \p MI is a store operation, "std::nullopt"
249 /// otherwise.
250 std::optional<SIMemOpInfo>
251 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
252
253 /// \returns Atomic fence info if \p MI is an atomic fence operation,
254 /// "std::nullopt" otherwise.
255 std::optional<SIMemOpInfo>
256 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
257
258 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
259 /// rmw operation, "std::nullopt" otherwise.
260 std::optional<SIMemOpInfo>
261 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
262};
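// Usage sketch (hypothetical driver code, mirroring how the pass's
// runOnMachineFunction is expected to use this helper): each memory
// instruction is classified exactly once and then expanded accordingly:
//
//   SIMemOpAccess MOA(MF);
//   for (MachineBasicBlock::iterator MI : ...) {
//     if (const auto MOI = MOA.getLoadInfo(MI))
//       Changed |= expandLoad(*MOI, MI);
//     else if (const auto MOI = MOA.getStoreInfo(MI))
//       Changed |= expandStore(*MOI, MI);
//     else if (const auto MOI = MOA.getAtomicFenceInfo(MI))
//       Changed |= expandAtomicFence(*MOI, MI);
//     else if (const auto MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
//       Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
//   }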
263
264class SICacheControl {
265protected:
266
267 /// AMDGPU subtarget info.
268 const GCNSubtarget &ST;
269
270 /// Instruction info.
271 const SIInstrInfo *TII = nullptr;
272
273 IsaVersion IV;
274
275 /// Whether to insert cache invalidating instructions.
276 bool InsertCacheInv;
277
278 SICacheControl(const GCNSubtarget &ST);
279
280 /// Sets the named bit \p Bit to "true" if present in instruction \p MI.
281 /// \returns Returns true if \p MI is modified, false otherwise.
282 bool enableNamedBit(const MachineBasicBlock::iterator MI,
283 AMDGPU::CPol::CPol Bit) const;
284
285public:
286
287 /// Create a cache control for the subtarget \p ST.
288 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
289
290 /// Update \p MI memory load instruction to bypass any caches up to
291 /// the \p Scope memory scope for address spaces \p
292 /// AddrSpace. Return true iff the instruction was modified.
293 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
294 SIAtomicScope Scope,
295 SIAtomicAddrSpace AddrSpace) const = 0;
296
297 /// Update \p MI memory store instruction to bypass any caches up to
298 /// the \p Scope memory scope for address spaces \p
299 /// AddrSpace. Return true iff the instruction was modified.
300 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
301 SIAtomicScope Scope,
302 SIAtomicAddrSpace AddrSpace) const = 0;
303
304 /// Update \p MI memory read-modify-write instruction to bypass any caches up
305 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
306 /// iff the instruction was modified.
307 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
308 SIAtomicScope Scope,
309 SIAtomicAddrSpace AddrSpace) const = 0;
310
311 /// Update \p MI memory instruction of kind \p Op associated with address
312 /// spaces \p AddrSpace to indicate it is volatile and/or
313 /// nontemporal/last-use. Return true iff the instruction was modified.
314 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
315 SIAtomicAddrSpace AddrSpace,
316 SIMemOp Op, bool IsVolatile,
317 bool IsNonTemporal,
318 bool IsLastUse = false) const = 0;
319
320 virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
321 return false;
322 };
323
324 /// Inserts any necessary instructions at position \p Pos relative
325 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
326 /// \p Op associated with address spaces \p AddrSpace have completed. Used
327 /// between memory instructions to enforce the order they become visible as
328 /// observed by other memory instructions executing in memory scope \p Scope.
329 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
330 /// address spaces. Returns true iff any instructions inserted.
331 virtual bool insertWait(MachineBasicBlock::iterator &MI,
332 SIAtomicScope Scope,
333 SIAtomicAddrSpace AddrSpace,
334 SIMemOp Op,
335 bool IsCrossAddrSpaceOrdering,
336 Position Pos) const = 0;
337
338 /// Inserts any necessary instructions at position \p Pos relative to
339 /// instruction \p MI to ensure any subsequent memory instructions of this
340 /// thread with address spaces \p AddrSpace will observe the previous memory
341 /// operations by any thread for memory scopes up to memory scope \p Scope .
342 /// Returns true iff any instructions inserted.
343 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
344 SIAtomicScope Scope,
345 SIAtomicAddrSpace AddrSpace,
346 Position Pos) const = 0;
347
348 /// Inserts any necessary instructions at position \p Pos relative to
349 /// instruction \p MI to ensure previous memory instructions by this thread
350 /// with address spaces \p AddrSpace have completed and can be observed by
351 /// subsequent memory instructions by any thread executing in memory scope \p
352 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
353 /// between address spaces. Returns true iff any instructions inserted.
354 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
355 SIAtomicScope Scope,
356 SIAtomicAddrSpace AddrSpace,
357 bool IsCrossAddrSpaceOrdering,
358 Position Pos) const = 0;
359
360 /// Virtual destructor to allow derivations to be deleted.
361 virtual ~SICacheControl() = default;
362
363 virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
364 MachineBasicBlock::iterator &MI) const {
365 return false;
366 }
367};
368
369class SIGfx6CacheControl : public SICacheControl {
370protected:
371
372 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
373 /// is modified, false otherwise.
374 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
375 return enableNamedBit(MI, AMDGPU::CPol::GLC);
376 }
377
378 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
379 /// is modified, false otherwise.
380 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
381 return enableNamedBit(MI, AMDGPU::CPol::SLC);
382 }
383
384public:
385
386 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
387
388 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
389 SIAtomicScope Scope,
390 SIAtomicAddrSpace AddrSpace) const override;
391
392 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
393 SIAtomicScope Scope,
394 SIAtomicAddrSpace AddrSpace) const override;
395
396 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
397 SIAtomicScope Scope,
398 SIAtomicAddrSpace AddrSpace) const override;
399
400 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
401 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
402 bool IsVolatile, bool IsNonTemporal,
403 bool IsLastUse) const override;
404
405 bool insertWait(MachineBasicBlock::iterator &MI,
406 SIAtomicScope Scope,
407 SIAtomicAddrSpace AddrSpace,
408 SIMemOp Op,
409 bool IsCrossAddrSpaceOrdering,
410 Position Pos) const override;
411
412 bool insertAcquire(MachineBasicBlock::iterator &MI,
413 SIAtomicScope Scope,
414 SIAtomicAddrSpace AddrSpace,
415 Position Pos) const override;
416
417 bool insertRelease(MachineBasicBlock::iterator &MI,
418 SIAtomicScope Scope,
419 SIAtomicAddrSpace AddrSpace,
420 bool IsCrossAddrSpaceOrdering,
421 Position Pos) const override;
422};
423
424class SIGfx7CacheControl : public SIGfx6CacheControl {
425public:
426
427 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
428
429 bool insertAcquire(MachineBasicBlock::iterator &MI,
430 SIAtomicScope Scope,
431 SIAtomicAddrSpace AddrSpace,
432 Position Pos) const override;
433
434};
435
436class SIGfx90ACacheControl : public SIGfx7CacheControl {
437public:
438
439 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
440
441 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
442 SIAtomicScope Scope,
443 SIAtomicAddrSpace AddrSpace) const override;
444
445 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
446 SIAtomicScope Scope,
447 SIAtomicAddrSpace AddrSpace) const override;
448
449 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
450 SIAtomicScope Scope,
451 SIAtomicAddrSpace AddrSpace) const override;
452
453 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
454 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
455 bool IsVolatile, bool IsNonTemporal,
456 bool IsLastUse) const override;
457
458 bool insertWait(MachineBasicBlock::iterator &MI,
459 SIAtomicScope Scope,
460 SIAtomicAddrSpace AddrSpace,
461 SIMemOp Op,
462 bool IsCrossAddrSpaceOrdering,
463 Position Pos) const override;
464
465 bool insertAcquire(MachineBasicBlock::iterator &MI,
466 SIAtomicScope Scope,
467 SIAtomicAddrSpace AddrSpace,
468 Position Pos) const override;
469
470 bool insertRelease(MachineBasicBlock::iterator &MI,
471 SIAtomicScope Scope,
472 SIAtomicAddrSpace AddrSpace,
473 bool IsCrossAddrSpaceOrdering,
474 Position Pos) const override;
475};
476
477class SIGfx940CacheControl : public SIGfx90ACacheControl {
478protected:
479
480 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
481 /// is modified, false otherwise.
482 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
483 return enableNamedBit(MI, AMDGPU::CPol::SC0);
484 }
485
486 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
487 /// is modified, false otherwise.
488 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
489 return enableNamedBit(MI, AMDGPU::CPol::SC1);
490 }
491
492 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
493 /// is modified, false otherwise.
494 bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
495 return enableNamedBit(MI, AMDGPU::CPol::NT);
496 }
497
498public:
499
500 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
501
502 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
503 SIAtomicScope Scope,
504 SIAtomicAddrSpace AddrSpace) const override;
505
506 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
507 SIAtomicScope Scope,
508 SIAtomicAddrSpace AddrSpace) const override;
509
510 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
511 SIAtomicScope Scope,
512 SIAtomicAddrSpace AddrSpace) const override;
513
514 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
515 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
516 bool IsVolatile, bool IsNonTemporal,
517 bool IsLastUse) const override;
518
519 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
520 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
521
522 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
523 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
524 Position Pos) const override;
525
526 bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
527 MachineBasicBlock::iterator &MI) const override {
528 bool Changed = false;
529 if (ST.hasForceStoreSC0SC1() &&
530 (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
531 SIAtomicAddrSpace::GLOBAL |
532 SIAtomicAddrSpace::OTHER)) !=
533 SIAtomicAddrSpace::NONE) {
534 Changed |= enableSC0Bit(MI);
535 Changed |= enableSC1Bit(MI);
536 }
537 return Changed;
538 }
539};
540
541class SIGfx10CacheControl : public SIGfx7CacheControl {
542protected:
543
544 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
545 /// is modified, false otherwise.
546 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
547 return enableNamedBit(MI, AMDGPU::CPol::DLC);
548 }
549
550public:
551
552 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
553
554 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
555 SIAtomicScope Scope,
556 SIAtomicAddrSpace AddrSpace) const override;
557
558 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
559 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
560 bool IsVolatile, bool IsNonTemporal,
561 bool IsLastUse) const override;
562
563 bool insertWait(MachineBasicBlock::iterator &MI,
564 SIAtomicScope Scope,
565 SIAtomicAddrSpace AddrSpace,
566 SIMemOp Op,
567 bool IsCrossAddrSpaceOrdering,
568 Position Pos) const override;
569
570 bool insertAcquire(MachineBasicBlock::iterator &MI,
571 SIAtomicScope Scope,
572 SIAtomicAddrSpace AddrSpace,
573 Position Pos) const override;
574};
575
576class SIGfx11CacheControl : public SIGfx10CacheControl {
577public:
578 SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
579
580 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
581 SIAtomicScope Scope,
582 SIAtomicAddrSpace AddrSpace) const override;
583
584 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
585 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
586 bool IsVolatile, bool IsNonTemporal,
587 bool IsLastUse) const override;
588};
589
590class SIGfx12CacheControl : public SIGfx11CacheControl {
591protected:
592 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
593 // \returns Returns true if \p MI is modified, false otherwise.
594 bool setTH(const MachineBasicBlock::iterator MI,
595 AMDGPU::CPol::CPol Value) const;
596 // Sets Scope policy to \p Value if CPol operand is present in instruction \p
597 // MI. \returns Returns true if \p MI is modified, false otherwise.
598 bool setScope(const MachineBasicBlock::iterator MI,
599 AMDGPU::CPol::CPol Value) const;
600
601 // Stores with system scope (SCOPE_SYS) need to wait for:
602 // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
603 // - non-returning-atomics - wait for STORECNT==0
604 // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
605 // since it does not distinguish atomics-with-return from regular stores.
606 // There is no need to wait if memory is cached (mtype != UC).
607 bool
608 insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
609
610public:
611 SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
612
613 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
614 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
615 bool IsCrossAddrSpaceOrdering, Position Pos) const override;
616
617 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
618 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
619
620 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
621 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
622 bool IsVolatile, bool IsNonTemporal,
623 bool IsLastUse) const override;
624
625 bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
626};
627
628class SIMemoryLegalizer final : public MachineFunctionPass {
629private:
630
631 /// Cache Control.
632 std::unique_ptr<SICacheControl> CC = nullptr;
633
634 /// List of atomic pseudo instructions.
635 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
636
637 /// Return true iff instruction \p MI is an atomic instruction that
638 /// returns a result.
639 bool isAtomicRet(const MachineInstr &MI) const {
640 return SIInstrInfo::isAtomicRet(MI);
641 }
642
643 /// Removes all processed atomic pseudo instructions from the current
644 /// function. Returns true if current function is modified, false otherwise.
645 bool removeAtomicPseudoMIs();
646
647 /// Expands load operation \p MI. Returns true if instructions are
648 /// added/deleted or \p MI is modified, false otherwise.
649 bool expandLoad(const SIMemOpInfo &MOI,
650 MachineBasicBlock::iterator &MI);
651 /// Expands store operation \p MI. Returns true if instructions are
652 /// added/deleted or \p MI is modified, false otherwise.
653 bool expandStore(const SIMemOpInfo &MOI,
654 MachineBasicBlock::iterator &MI);
655 /// Expands atomic fence operation \p MI. Returns true if
656 /// instructions are added/deleted or \p MI is modified, false otherwise.
657 bool expandAtomicFence(const SIMemOpInfo &MOI,
658 MachineBasicBlock::iterator &MI);
659 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
660 /// instructions are added/deleted or \p MI is modified, false otherwise.
661 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
662 MachineBasicBlock::iterator &MI);
663
664public:
665 static char ID;
666
667 SIMemoryLegalizer() : MachineFunctionPass(ID) {}
668
669 void getAnalysisUsage(AnalysisUsage &AU) const override {
670 AU.setPreservesCFG();
671 MachineFunctionPass::getAnalysisUsage(AU);
672 }
673
674 StringRef getPassName() const override {
675 return PASS_NAME;
676 }
677
678 bool runOnMachineFunction(MachineFunction &MF) override;
679};
680
681} // end namespace anonymous
682
683void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
684 const char *Msg) const {
685 const Function &Func = MI->getParent()->getParent()->getFunction();
686 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
687 Func.getContext().diagnose(Diag);
688}
689
690std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
691SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
692 SIAtomicAddrSpace InstrAddrSpace) const {
693 if (SSID == SyncScope::System)
694 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
695 if (SSID == MMI->getAgentSSID())
696 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
697 if (SSID == MMI->getWorkgroupSSID())
698 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
699 true);
700 if (SSID == MMI->getWavefrontSSID())
701 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
702 true);
703 if (SSID == SyncScope::SingleThread)
704 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
705 true);
706 if (SSID == MMI->getSystemOneAddressSpaceSSID())
707 return std::tuple(SIAtomicScope::SYSTEM,
708 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
709 if (SSID == MMI->getAgentOneAddressSpaceSSID())
710 return std::tuple(SIAtomicScope::AGENT,
711 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
712 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
713 return std::tuple(SIAtomicScope::WORKGROUP,
714 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
715 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
716 return std::tuple(SIAtomicScope::WAVEFRONT,
717 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
718 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
719 return std::tuple(SIAtomicScope::SINGLETHREAD,
720 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
721 return std::nullopt;
722}
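// Illustrative mapping (derived from the cases above): an IR fence such as
//   fence syncscope("agent") acquire
// yields (SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true), whereas the
// one-address-space variant
//   fence syncscope("agent-one-as") acquire
// is narrowed to the address spaces the instruction actually touches and
// reports no cross-address-space ordering (the trailing "false").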
723
724SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
725 if (AS == AMDGPUAS::FLAT_ADDRESS)
726 return SIAtomicAddrSpace::FLAT;
727 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
728 return SIAtomicAddrSpace::GLOBAL;
729 if (AS == AMDGPUAS::LOCAL_ADDRESS)
730 return SIAtomicAddrSpace::LDS;
731 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
732 return SIAtomicAddrSpace::SCRATCH;
733 if (AS == AMDGPUAS::REGION_ADDRESS)
734 return SIAtomicAddrSpace::GDS;
735
736 return SIAtomicAddrSpace::OTHER;
737}
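// Worked example (follows directly from the mapping above): a FLAT_ADDRESS
// access reports SIAtomicAddrSpace::FLAT, i.e. GLOBAL | LDS | SCRATCH, so a
// flat atomic conservatively orders all three of those address spaces:
//
//   assert(toSIAtomicAddrSpace(AMDGPUAS::FLAT_ADDRESS) ==
//          (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS |
//           SIAtomicAddrSpace::SCRATCH));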
738
739SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
740 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
741}
742
743std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
744 const MachineBasicBlock::iterator &MI) const {
745 assert(MI->getNumMemOperands() > 0);
746
747 SyncScope::ID SSID = SyncScope::SingleThread;
748 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
749 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
750 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
751 bool IsNonTemporal = true;
752 bool IsVolatile = false;
753 bool IsLastUse = false;
754
755 // Validator should check whether or not MMOs cover the entire set of
756 // locations accessed by the memory instruction.
757 for (const auto &MMO : MI->memoperands()) {
758 IsNonTemporal &= MMO->isNonTemporal();
759 IsVolatile |= MMO->isVolatile();
760 IsLastUse |= MMO->getFlags() & MOLastUse;
761 InstrAddrSpace |=
762 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
763 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
764 if (OpOrdering != AtomicOrdering::NotAtomic) {
765 const auto &IsSyncScopeInclusion =
766 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
767 if (!IsSyncScopeInclusion) {
768 reportUnsupported(MI,
769 "Unsupported non-inclusive atomic synchronization scope");
770 return std::nullopt;
771 }
772
773 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
774 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
775 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
776 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
777 FailureOrdering =
778 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
779 }
780 }
781
782 SIAtomicScope Scope = SIAtomicScope::NONE;
783 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
784 bool IsCrossAddressSpaceOrdering = false;
785 if (Ordering != AtomicOrdering::NotAtomic) {
786 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
787 if (!ScopeOrNone) {
788 reportUnsupported(MI, "Unsupported atomic synchronization scope");
789 return std::nullopt;
790 }
791 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
792 *ScopeOrNone;
793 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
794 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
795 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
796 reportUnsupported(MI, "Unsupported atomic address space");
797 return std::nullopt;
798 }
799 }
800 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
801 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
802 IsNonTemporal, IsLastUse);
803}
804
805std::optional<SIMemOpInfo>
806SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
807 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
808
809 if (!(MI->mayLoad() && !MI->mayStore()))
810 return std::nullopt;
811
812 // Be conservative if there are no memory operands.
813 if (MI->getNumMemOperands() == 0)
814 return SIMemOpInfo();
815
816 return constructFromMIWithMMO(MI);
817}
818
819std::optional<SIMemOpInfo>
820SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
821 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
822
823 if (!(!MI->mayLoad() && MI->mayStore()))
824 return std::nullopt;
825
826 // Be conservative if there are no memory operands.
827 if (MI->getNumMemOperands() == 0)
828 return SIMemOpInfo();
829
830 return constructFromMIWithMMO(MI);
831}
832
833std::optional<SIMemOpInfo>
834SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
835 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
836
837 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
838 return std::nullopt;
839
840 AtomicOrdering Ordering =
841 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
842
843 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
844 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
845 if (!ScopeOrNone) {
846 reportUnsupported(MI, "Unsupported atomic synchronization scope");
847 return std::nullopt;
848 }
849
850 SIAtomicScope Scope = SIAtomicScope::NONE;
851 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
852 bool IsCrossAddressSpaceOrdering = false;
853 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
854 *ScopeOrNone;
855
856 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
857 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
858 reportUnsupported(MI, "Unsupported atomic address space");
859 return std::nullopt;
860 }
861
862 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
863 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
864}
865
866std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
867 const MachineBasicBlock::iterator &MI) const {
868 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
869
870 if (!(MI->mayLoad() && MI->mayStore()))
871 return std::nullopt;
872
873 // Be conservative if there are no memory operands.
874 if (MI->getNumMemOperands() == 0)
875 return SIMemOpInfo();
876
877 return constructFromMIWithMMO(MI);
878}
879
880SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
881 TII = ST.getInstrInfo();
882 IV = getIsaVersion(ST.getCPU());
883 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
884}
885
886bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
887 AMDGPU::CPol::CPol Bit) const {
888 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
889 if (!CPol)
890 return false;
891
892 CPol->setImm(CPol->getImm() | Bit);
893 return true;
894}
895
896/* static */
897std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
898 GCNSubtarget::Generation Generation = ST.getGeneration();
899 if (ST.hasGFX940Insts())
900 return std::make_unique<SIGfx940CacheControl>(ST);
901 if (ST.hasGFX90AInsts())
902 return std::make_unique<SIGfx90ACacheControl>(ST);
903 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
904 return std::make_unique<SIGfx6CacheControl>(ST);
905 if (Generation < AMDGPUSubtarget::GFX10)
906 return std::make_unique<SIGfx7CacheControl>(ST);
907 if (Generation < AMDGPUSubtarget::GFX11)
908 return std::make_unique<SIGfx10CacheControl>(ST);
909 if (Generation < AMDGPUSubtarget::GFX12)
910 return std::make_unique<SIGfx11CacheControl>(ST);
911 return std::make_unique<SIGfx12CacheControl>(ST);
912}
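// Usage sketch (hypothetical, mirroring how SIMemoryLegalizer drives this
// factory): the pass keeps a single SICacheControl per function and only
// talks to the virtual interface, so the GFX6..GFX12 differences stay inside
// the subclasses selected above:
//
//   std::unique_ptr<SICacheControl> CC = SICacheControl::create(ST);
//   CC->insertWait(MI, SIAtomicScope::AGENT, SIAtomicAddrSpace::GLOBAL,
//                  SIMemOp::LOAD | SIMemOp::STORE,
//                  /*IsCrossAddrSpaceOrdering=*/false, Position::BEFORE);
//   CC->insertAcquire(MI, SIAtomicScope::AGENT, SIAtomicAddrSpace::GLOBAL,
//                     Position::AFTER);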
913
914bool SIGfx6CacheControl::enableLoadCacheBypass(
915 const MachineBasicBlock::iterator &MI,
916 SIAtomicScope Scope,
917 SIAtomicAddrSpace AddrSpace) const {
918 assert(MI->mayLoad() && !MI->mayStore());
919 bool Changed = false;
920
921 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
922 switch (Scope) {
923 case SIAtomicScope::SYSTEM:
924 case SIAtomicScope::AGENT:
925 // Set L1 cache policy to MISS_EVICT.
926 // Note: there is no L2 cache bypass policy at the ISA level.
927 Changed |= enableGLCBit(MI);
928 break;
929 case SIAtomicScope::WORKGROUP:
930 case SIAtomicScope::WAVEFRONT:
931 case SIAtomicScope::SINGLETHREAD:
932 // No cache to bypass.
933 break;
934 default:
935 llvm_unreachable("Unsupported synchronization scope");
936 }
937 }
938
939 /// The scratch address space does not need the global memory caches
940 /// to be bypassed as all memory operations by the same thread are
941 /// sequentially consistent, and no other thread can access scratch
942 /// memory.
943
944 /// Other address spaces do not have a cache.
945
946 return Changed;
947}
948
949bool SIGfx6CacheControl::enableStoreCacheBypass(
950 const MachineBasicBlock::iterator &MI,
951 SIAtomicScope Scope,
952 SIAtomicAddrSpace AddrSpace) const {
953 assert(!MI->mayLoad() && MI->mayStore());
954 bool Changed = false;
955
956 /// The L1 cache is write-through, so it does not need to be bypassed. There is
957 /// no bypass control for the L2 cache at the ISA level.
958
959 return Changed;
960}
961
962bool SIGfx6CacheControl::enableRMWCacheBypass(
963 const MachineBasicBlock::iterator &MI,
964 SIAtomicScope Scope,
965 SIAtomicAddrSpace AddrSpace) const {
966 assert(MI->mayLoad() && MI->mayStore());
967 bool Changed = false;
968
969 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
970 /// bypassed, and the GLC bit is instead used to indicate if they are
971 /// return or no-return.
972 /// Note: there is no L2 cache coherent bypass control at the ISA level.
973
974 return Changed;
975}
976
977bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
978 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
979 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
980 // Only handle load and store, not atomic read-modify-write instructions. The
981 // latter use glc to indicate if the atomic returns a result and so must not
982 // be used for cache control.
983 assert(MI->mayLoad() ^ MI->mayStore());
984
985 // Only update load and store, not LLVM IR atomic read-modify-write
986 // instructions. The latter are always marked as volatile, so they cannot
987 // sensibly be handled here without pessimizing all atomics. They also do not
988 // support the nontemporal attribute.
989 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
990
991 bool Changed = false;
992
993 if (IsVolatile) {
994 // Set L1 cache policy to be MISS_EVICT for load instructions
995 // and MISS_LRU for store instructions.
996 // Note: there is no L2 cache bypass policy at the ISA level.
997 if (Op == SIMemOp::LOAD)
998 Changed |= enableGLCBit(MI);
999
1000 // Ensure operation has completed at system scope to cause all volatile
1001 // operations to be visible outside the program in a global order. Do not
1002 // request cross address space as only the global address space can be
1003 // observable outside the program, so no need to cause a waitcnt for LDS
1004 // address space operations.
1005 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1006 Position::AFTER);
1007
1008 return Changed;
1009 }
1010
1011 if (IsNonTemporal) {
1012 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1013 // for both loads and stores, and the L2 cache policy to STREAM.
1014 Changed |= enableGLCBit(MI);
1015 Changed |= enableSLCBit(MI);
1016 return Changed;
1017 }
1018
1019 return Changed;
1020}
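// Worked example (summarizing the cases above for GFX6): a volatile global
// load gets GLC set (L1 MISS_EVICT) and is followed by a system-scope wait,
// roughly:
//
//   BUFFER_LOAD_DWORD ... glc
//   S_WAITCNT vmcnt(0)
//
// while a nontemporal access gets GLC and SLC set so the L2 uses its STREAM
// policy, with no extra wait.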
1021
1022bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1023 SIAtomicScope Scope,
1024 SIAtomicAddrSpace AddrSpace,
1025 SIMemOp Op,
1026 bool IsCrossAddrSpaceOrdering,
1027 Position Pos) const {
1028 bool Changed = false;
1029
1030 MachineBasicBlock &MBB = *MI->getParent();
1031 DebugLoc DL = MI->getDebugLoc();
1032
1033 if (Pos == Position::AFTER)
1034 ++MI;
1035
1036 bool VMCnt = false;
1037 bool LGKMCnt = false;
1038
1039 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1040 SIAtomicAddrSpace::NONE) {
1041 switch (Scope) {
1042 case SIAtomicScope::SYSTEM:
1043 case SIAtomicScope::AGENT:
1044 VMCnt |= true;
1045 break;
1046 case SIAtomicScope::WORKGROUP:
1047 case SIAtomicScope::WAVEFRONT:
1048 case SIAtomicScope::SINGLETHREAD:
1049 // The L1 cache keeps all memory operations in order for
1050 // wavefronts in the same work-group.
1051 break;
1052 default:
1053 llvm_unreachable("Unsupported synchronization scope");
1054 }
1055 }
1056
1057 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1058 switch (Scope) {
1059 case SIAtomicScope::SYSTEM:
1060 case SIAtomicScope::AGENT:
1061 case SIAtomicScope::WORKGROUP:
1062 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1063 // not needed as LDS operations for all waves are executed in a total
1064 // global ordering as observed by all waves. Required if also
1065 // synchronizing with global/GDS memory as LDS operations could be
1066 // reordered with respect to later global/GDS memory operations of the
1067 // same wave.
1068 LGKMCnt |= IsCrossAddrSpaceOrdering;
1069 break;
1070 case SIAtomicScope::WAVEFRONT:
1071 case SIAtomicScope::SINGLETHREAD:
1072 // The LDS keeps all memory operations in order for
1073 // the same wavefront.
1074 break;
1075 default:
1076 llvm_unreachable("Unsupported synchronization scope");
1077 }
1078 }
1079
1080 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1081 switch (Scope) {
1082 case SIAtomicScope::SYSTEM:
1083 case SIAtomicScope::AGENT:
1084 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1085 // is not needed as GDS operations for all waves are executed in a total
1086 // global ordering as observed by all waves. Required if also
1087 // synchronizing with global/LDS memory as GDS operations could be
1088 // reordered with respect to later global/LDS memory operations of the
1089 // same wave.
1090 LGKMCnt |= IsCrossAddrSpaceOrdering;
1091 break;
1092 case SIAtomicScope::WORKGROUP:
1093 case SIAtomicScope::WAVEFRONT:
1094 case SIAtomicScope::SINGLETHREAD:
1095 // The GDS keeps all memory operations in order for
1096 // the same work-group.
1097 break;
1098 default:
1099 llvm_unreachable("Unsupported synchronization scope");
1100 }
1101 }
1102
1103 if (VMCnt || LGKMCnt) {
1104 unsigned WaitCntImmediate =
1105 AMDGPU::encodeWaitcnt(IV,
1106 VMCnt ? 0 : getVmcntBitMask(IV),
1107 getExpcntBitMask(IV),
1108 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1109 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1110 .addImm(WaitCntImmediate);
1111 Changed = true;
1112 }
1113
1114 if (Pos == Position::AFTER)
1115 --MI;
1116
1117 return Changed;
1118}
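// Worked example (assuming the GFX6 encoding helpers above): for an
// agent-scope operation ordering GLOBAL and LDS with cross-address-space
// ordering, VMCnt and LGKMCnt are both requested, EXPCNT is left at its
// "don't wait" mask, and the inserted instruction is effectively
//   S_WAITCNT vmcnt(0) lgkmcnt(0)
// (emitted as S_WAITCNT_soft so later passes may relax or merge it).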
1119
1120bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1121 SIAtomicScope Scope,
1122 SIAtomicAddrSpace AddrSpace,
1123 Position Pos) const {
1124 if (!InsertCacheInv)
1125 return false;
1126
1127 bool Changed = false;
1128
1129 MachineBasicBlock &MBB = *MI->getParent();
1130 DebugLoc DL = MI->getDebugLoc();
1131
1132 if (Pos == Position::AFTER)
1133 ++MI;
1134
1135 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1136 switch (Scope) {
1137 case SIAtomicScope::SYSTEM:
1138 case SIAtomicScope::AGENT:
1139 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1140 Changed = true;
1141 break;
1142 case SIAtomicScope::WORKGROUP:
1143 case SIAtomicScope::WAVEFRONT:
1144 case SIAtomicScope::SINGLETHREAD:
1145 // No cache to invalidate.
1146 break;
1147 default:
1148 llvm_unreachable("Unsupported synchronization scope");
1149 }
1150 }
1151
1152 /// The scratch address space does not need the global memory cache
1153 /// to be flushed as all memory operations by the same thread are
1154 /// sequentially consistent, and no other thread can access scratch
1155 /// memory.
1156
1157 /// Other address spaces do not have a cache.
1158
1159 if (Pos == Position::AFTER)
1160 --MI;
1161
1162 return Changed;
1163}
1164
1165bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1166 SIAtomicScope Scope,
1167 SIAtomicAddrSpace AddrSpace,
1168 bool IsCrossAddrSpaceOrdering,
1169 Position Pos) const {
1170 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1171 IsCrossAddrSpaceOrdering, Pos);
1172}
1173
1174bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1175 SIAtomicScope Scope,
1176 SIAtomicAddrSpace AddrSpace,
1177 Position Pos) const {
1178 if (!InsertCacheInv)
1179 return false;
1180
1181 bool Changed = false;
1182
1183 MachineBasicBlock &MBB = *MI->getParent();
1184 DebugLoc DL = MI->getDebugLoc();
1185
1186 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1187
1188 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1189 ? AMDGPU::BUFFER_WBINVL1
1190 : AMDGPU::BUFFER_WBINVL1_VOL;
1191
1192 if (Pos == Position::AFTER)
1193 ++MI;
1194
1195 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1196 switch (Scope) {
1197 case SIAtomicScope::SYSTEM:
1198 case SIAtomicScope::AGENT:
1199 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1200 Changed = true;
1201 break;
1202 case SIAtomicScope::WORKGROUP:
1203 case SIAtomicScope::WAVEFRONT:
1204 case SIAtomicScope::SINGLETHREAD:
1205 // No cache to invalidate.
1206 break;
1207 default:
1208 llvm_unreachable("Unsupported synchronization scope");
1209 }
1210 }
1211
1212 /// The scratch address space does not need the global memory cache
1213 /// to be flushed as all memory operations by the same thread are
1214 /// sequentially consistent, and no other thread can access scratch
1215 /// memory.
1216
1217 /// Other address spaces do not have a cache.
1218
1219 if (Pos == Position::AFTER)
1220 --MI;
1221
1222 return Changed;
1223}
1224
1225bool SIGfx90ACacheControl::enableLoadCacheBypass(
1226 const MachineBasicBlock::iterator &MI,
1227 SIAtomicScope Scope,
1228 SIAtomicAddrSpace AddrSpace) const {
1229 assert(MI->mayLoad() && !MI->mayStore());
1230 bool Changed = false;
1231
1232 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1233 switch (Scope) {
1234 case SIAtomicScope::SYSTEM:
1235 case SIAtomicScope::AGENT:
1236 // Set the L1 cache policy to MISS_LRU.
1237 // Note: there is no L2 cache bypass policy at the ISA level.
1238 Changed |= enableGLCBit(MI);
1239 break;
1240 case SIAtomicScope::WORKGROUP:
1241 // In threadgroup split mode the waves of a work-group can be executing on
1242 // different CUs. Therefore need to bypass the L1 which is per CU.
1243 // Otherwise in non-threadgroup split mode all waves of a work-group are
1244 // on the same CU, and so the L1 does not need to be bypassed.
1245 if (ST.isTgSplitEnabled())
1246 Changed |= enableGLCBit(MI);
1247 break;
1248 case SIAtomicScope::WAVEFRONT:
1249 case SIAtomicScope::SINGLETHREAD:
1250 // No cache to bypass.
1251 break;
1252 default:
1253 llvm_unreachable("Unsupported synchronization scope");
1254 }
1255 }
1256
1257 /// The scratch address space does not need the global memory caches
1258 /// to be bypassed as all memory operations by the same thread are
1259 /// sequentially consistent, and no other thread can access scratch
1260 /// memory.
1261
1262 /// Other address spaces do not have a cache.
1263
1264 return Changed;
1265}
1266
1267bool SIGfx90ACacheControl::enableStoreCacheBypass(
1268 const MachineBasicBlock::iterator &MI,
1269 SIAtomicScope Scope,
1270 SIAtomicAddrSpace AddrSpace) const {
1271 assert(!MI->mayLoad() && MI->mayStore());
1272 bool Changed = false;
1273
1274 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1275 switch (Scope) {
1276 case SIAtomicScope::SYSTEM:
1277 case SIAtomicScope::AGENT:
1278 /// Do not set glc for store atomic operations as they implicitly write
1279 /// through the L1 cache.
1280 break;
1281 case SIAtomicScope::WORKGROUP:
1282 case SIAtomicScope::WAVEFRONT:
1283 case SIAtomicScope::SINGLETHREAD:
1284 // No cache to bypass. Store atomics implicitly write through the L1
1285 // cache.
1286 break;
1287 default:
1288 llvm_unreachable("Unsupported synchronization scope");
1289 }
1290 }
1291
1292 /// The scratch address space does not need the global memory caches
1293 /// to be bypassed as all memory operations by the same thread are
1294 /// sequentially consistent, and no other thread can access scratch
1295 /// memory.
1296
1297 /// Other address spaces do not have a cache.
1298
1299 return Changed;
1300}
1301
1302bool SIGfx90ACacheControl::enableRMWCacheBypass(
1303 const MachineBasicBlock::iterator &MI,
1304 SIAtomicScope Scope,
1305 SIAtomicAddrSpace AddrSpace) const {
1306 assert(MI->mayLoad() && MI->mayStore());
1307 bool Changed = false;
1308
1309 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1310 switch (Scope) {
1311 case SIAtomicScope::SYSTEM:
1312 case SIAtomicScope::AGENT:
1313 /// Do not set glc for RMW atomic operations as they implicitly bypass
1314 /// the L1 cache, and the glc bit is instead used to indicate if they are
1315 /// return or no-return.
1316 break;
1317 case SIAtomicScope::WORKGROUP:
1318 case SIAtomicScope::WAVEFRONT:
1319 case SIAtomicScope::SINGLETHREAD:
1320 // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1321 break;
1322 default:
1323 llvm_unreachable("Unsupported synchronization scope");
1324 }
1325 }
1326
1327 return Changed;
1328}
1329
1330bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1331 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1332 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1333 // Only handle load and store, not atomic read-modify-write instructions. The
1334 // latter use glc to indicate if the atomic returns a result and so must not
1335 // be used for cache control.
1336 assert(MI->mayLoad() ^ MI->mayStore());
1337
1338 // Only update load and store, not LLVM IR atomic read-modify-write
1339 // instructions. The latter are always marked as volatile, so they cannot
1340 // sensibly be handled here without pessimizing all atomics. They also do not
1341 // support the nontemporal attribute.
1342 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1343
1344 bool Changed = false;
1345
1346 if (IsVolatile) {
1347 // Set L1 cache policy to be MISS_EVICT for load instructions
1348 // and MISS_LRU for store instructions.
1349 // Note: there is no L2 cache bypass policy at the ISA level.
1350 if (Op == SIMemOp::LOAD)
1351 Changed |= enableGLCBit(MI);
1352
1353 // Ensure operation has completed at system scope to cause all volatile
1354 // operations to be visible outside the program in a global order. Do not
1355 // request cross address space as only the global address space can be
1356 // observable outside the program, so no need to cause a waitcnt for LDS
1357 // address space operations.
1358 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1359 Position::AFTER);
1360
1361 return Changed;
1362 }
1363
1364 if (IsNonTemporal) {
1365 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1366 // for both loads and stores, and the L2 cache policy to STREAM.
1367 Changed |= enableGLCBit(MI);
1368 Changed |= enableSLCBit(MI);
1369 return Changed;
1370 }
1371
1372 return Changed;
1373}
1374
1375bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1376 SIAtomicScope Scope,
1377 SIAtomicAddrSpace AddrSpace,
1378 SIMemOp Op,
1379 bool IsCrossAddrSpaceOrdering,
1380 Position Pos) const {
1381 if (ST.isTgSplitEnabled()) {
1382 // In threadgroup split mode the waves of a work-group can be executing on
1383 // different CUs. Therefore need to wait for global or GDS memory operations
1384 // to complete to ensure they are visible to waves in the other CUs.
1385 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1386 // the same CU, so no need to wait for global memory as all waves in the
1387 // work-group access the same L1, nor wait for GDS as accesses are ordered
1388 // on a CU.
1389 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1390 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1391 (Scope == SIAtomicScope::WORKGROUP)) {
1392 // Same as GFX7 using agent scope.
1393 Scope = SIAtomicScope::AGENT;
1394 }
1395 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1396 // LDS memory operations.
1397 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1398 }
1399 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1400 IsCrossAddrSpaceOrdering, Pos);
1401}
1402
1403bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1404 SIAtomicScope Scope,
1405 SIAtomicAddrSpace AddrSpace,
1406 Position Pos) const {
1407 if (!InsertCacheInv)
1408 return false;
1409
1410 bool Changed = false;
1411
1412 MachineBasicBlock &MBB = *MI->getParent();
1413 DebugLoc DL = MI->getDebugLoc();
1414
1415 if (Pos == Position::AFTER)
1416 ++MI;
1417
1418 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1419 switch (Scope) {
1420 case SIAtomicScope::SYSTEM:
1421 // Ensures that following loads will not see stale remote VMEM data or
1422 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1423 // CC will never be stale due to the local memory probes.
1424 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1425 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1426 // hardware does not reorder memory operations by the same wave with
1427 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1428 // remove any cache lines of earlier writes by the same wave and ensures
1429 // later reads by the same wave will refetch the cache lines.
1430 Changed = true;
1431 break;
1432 case SIAtomicScope::AGENT:
1433 // Same as GFX7.
1434 break;
1435 case SIAtomicScope::WORKGROUP:
1436 // In threadgroup split mode the waves of a work-group can be executing on
1437 // different CUs. Therefore need to invalidate the L1 which is per CU.
1438 // Otherwise in non-threadgroup split mode all waves of a work-group are
1439 // on the same CU, and so the L1 does not need to be invalidated.
1440 if (ST.isTgSplitEnabled()) {
1441 // Same as GFX7 using agent scope.
1442 Scope = SIAtomicScope::AGENT;
1443 }
1444 break;
1445 case SIAtomicScope::WAVEFRONT:
1446 case SIAtomicScope::SINGLETHREAD:
1447 // Same as GFX7.
1448 break;
1449 default:
1450 llvm_unreachable("Unsupported synchronization scope");
1451 }
1452 }
1453
1454 /// The scratch address space does not need the global memory cache
1455 /// to be flushed as all memory operations by the same thread are
1456 /// sequentially consistent, and no other thread can access scratch
1457 /// memory.
1458
1459 /// Other address spaces do not have a cache.
1460
1461 if (Pos == Position::AFTER)
1462 --MI;
1463
1464 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1465
1466 return Changed;
1467}
1468
1469bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1470 SIAtomicScope Scope,
1471 SIAtomicAddrSpace AddrSpace,
1472 bool IsCrossAddrSpaceOrdering,
1473 Position Pos) const {
1474 bool Changed = false;
1475
1476 MachineBasicBlock &MBB = *MI->getParent();
1477 const DebugLoc &DL = MI->getDebugLoc();
1478
1479 if (Pos == Position::AFTER)
1480 ++MI;
1481
1482 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1483 switch (Scope) {
1484 case SIAtomicScope::SYSTEM:
1485 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1486 // hardware does not reorder memory operations by the same wave with
1487 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1488 // to initiate writeback of any dirty cache lines of earlier writes by the
1489 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1490 // writeback has completed.
1491 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1492 // Set SC bits to indicate system scope.
1493 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1494 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1495 // vmcnt(0)" needed by the "BUFFER_WBL2".
1496 Changed = true;
1497 break;
1498 case SIAtomicScope::AGENT:
1499 case SIAtomicScope::WORKGROUP:
1500 case SIAtomicScope::WAVEFRONT:
1501 case SIAtomicScope::SINGLETHREAD:
1502 // Same as GFX7.
1503 break;
1504 default:
1505 llvm_unreachable("Unsupported synchronization scope");
1506 }
1507 }
1508
1509 if (Pos == Position::AFTER)
1510 --MI;
1511
1512 Changed |=
1513 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1514 IsCrossAddrSpaceOrdering, Pos);
1515
1516 return Changed;
1517}
1518
1519bool SIGfx940CacheControl::enableLoadCacheBypass(
1520 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1521 SIAtomicAddrSpace AddrSpace) const {
1522 assert(MI->mayLoad() && !MI->mayStore());
1523 bool Changed = false;
1524
1525 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1526 switch (Scope) {
1527 case SIAtomicScope::SYSTEM:
1528 // Set SC bits to indicate system scope.
1529 Changed |= enableSC0Bit(MI);
1530 Changed |= enableSC1Bit(MI);
1531 break;
1532 case SIAtomicScope::AGENT:
1533 // Set SC bits to indicate agent scope.
1534 Changed |= enableSC1Bit(MI);
1535 break;
1536 case SIAtomicScope::WORKGROUP:
1537 // In threadgroup split mode the waves of a work-group can be executing on
1538 // different CUs. Therefore need to bypass the L1 which is per CU.
1539 // Otherwise in non-threadgroup split mode all waves of a work-group are
1540 // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1541 // bits to indicate work-group scope will do this automatically.
1542 Changed |= enableSC0Bit(MI);
1543 break;
1544 case SIAtomicScope::WAVEFRONT:
1545 case SIAtomicScope::SINGLETHREAD:
1546 // Leave SC bits unset to indicate wavefront scope.
1547 break;
1548 default:
1549 llvm_unreachable("Unsupported synchronization scope");
1550 }
1551 }
1552
1553 /// The scratch address space does not need the global memory caches
1554 /// to be bypassed as all memory operations by the same thread are
1555 /// sequentially consistent, and no other thread can access scratch
1556 /// memory.
1557
1558 /// Other address spaces do not have a cache.
1559
1560 return Changed;
1561}
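// Summary (of the cases above and of the store variant that follows): the
// GFX940 SC-bit encoding used here is SYSTEM = SC0|SC1, AGENT = SC1 only,
// WORKGROUP = SC0 only, and WAVEFRONT/SINGLETHREAD leave both clear. RMW
// atomics are the exception: they only use SC1, since SC0 encodes whether the
// atomic returns a value.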
1562
1563bool SIGfx940CacheControl::enableStoreCacheBypass(
1564 const MachineBasicBlock::iterator &MI,
1565 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1566 assert(!MI->mayLoad() && MI->mayStore());
1567 bool Changed = false;
1568
1569 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1570 switch (Scope) {
1571 case SIAtomicScope::SYSTEM:
1572 // Set SC bits to indicate system scope.
1573 Changed |= enableSC0Bit(MI);
1574 Changed |= enableSC1Bit(MI);
1575 break;
1576 case SIAtomicScope::AGENT:
1577 // Set SC bits to indicate agent scope.
1578 Changed |= enableSC1Bit(MI);
1579 break;
1580 case SIAtomicScope::WORKGROUP:
1581 // Set SC bits to indicate workgroup scope.
1582 Changed |= enableSC0Bit(MI);
1583 break;
1584 case SIAtomicScope::WAVEFRONT:
1585 case SIAtomicScope::SINGLETHREAD:
1586 // Leave SC bits unset to indicate wavefront scope.
1587 break;
1588 default:
1589 llvm_unreachable("Unsupported synchronization scope");
1590 }
1591 }
1592
1593 /// The scratch address space does not need the global memory caches
1594 /// to be bypassed as all memory operations by the same thread are
1595 /// sequentially consistent, and no other thread can access scratch
1596 /// memory.
1597
1598 /// Other address spaces do not have a cache.
1599
1600 return Changed;
1601}
1602
1603bool SIGfx940CacheControl::enableRMWCacheBypass(
1604 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1605 SIAtomicAddrSpace AddrSpace) const {
1606 assert(MI->mayLoad() && MI->mayStore());
1607 bool Changed = false;
1608
1609 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1610 switch (Scope) {
1611 case SIAtomicScope::SYSTEM:
1612 // Set SC1 bit to indicate system scope.
1613 Changed |= enableSC1Bit(MI);
1614 break;
1615 case SIAtomicScope::AGENT:
1616 case SIAtomicScope::WORKGROUP:
1617 case SIAtomicScope::WAVEFRONT:
1618 case SIAtomicScope::SINGLETHREAD:
1619 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1620 // to indicate system or agent scope. The SC0 bit is used to indicate if
1621 // they are return or no-return. Leave SC1 bit unset to indicate agent
1622 // scope.
1623 break;
1624 default:
1625 llvm_unreachable("Unsupported synchronization scope");
1626 }
1627 }
1628
1629 return Changed;
1630}
1631
1632bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1633 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1634 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1635 // Only handle load and store, not atomic read-modify-write instructions. The
1636 // latter use glc to indicate if the atomic returns a result and so must not
1637 // be used for cache control.
1638 assert(MI->mayLoad() ^ MI->mayStore());
1639
1640 // Only update load and store, not LLVM IR atomic read-modify-write
1641 // instructions. The latter are always marked as volatile, so they cannot be
1642 // handled sensibly here without pessimizing all atomics. They also do not
1643 // support the nontemporal attribute.
1644 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1645
1646 bool Changed = false;
1647
1648 if (IsVolatile) {
1649 // Set SC bits to indicate system scope.
1650 Changed |= enableSC0Bit(MI);
1651 Changed |= enableSC1Bit(MI);
1652
1653 // Ensure operation has completed at system scope to cause all volatile
1654 // operations to be visible outside the program in a global order. Do not
1655 // request cross address space as only the global address space can be
1656 // observable outside the program, so no need to cause a waitcnt for LDS
1657 // address space operations.
1658 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1659 Position::AFTER);
1660
1661 return Changed;
1662 }
1663
1664 if (IsNonTemporal) {
1665 Changed |= enableNTBit(MI);
1666 return Changed;
1667 }
1668
1669 return Changed;
1670}
1671
1672bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1673 SIAtomicScope Scope,
1674 SIAtomicAddrSpace AddrSpace,
1675 Position Pos) const {
1676 if (!InsertCacheInv)
1677 return false;
1678
1679 bool Changed = false;
1680
1681 MachineBasicBlock &MBB = *MI->getParent();
1682 DebugLoc DL = MI->getDebugLoc();
1683
1684 if (Pos == Position::AFTER)
1685 ++MI;
1686
1687 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1688 switch (Scope) {
1689 case SIAtomicScope::SYSTEM:
1690 // Ensures that following loads will not see stale remote VMEM data or
1691 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1692 // CC will never be stale due to the local memory probes.
1693 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1694 // Set SC bits to indicate system scope.
1695 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1696 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1697 // hardware does not reorder memory operations by the same wave with
1698 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1699 // remove any cache lines of earlier writes by the same wave and ensures
1700 // later reads by the same wave will refetch the cache lines.
1701 Changed = true;
1702 break;
1703 case SIAtomicScope::AGENT:
1704 // Ensures that following loads will not see stale remote data or local
1705 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1706 // due to the memory probes.
1707 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1708 // Set SC bits to indicate agent scope.
1709 .addImm(AMDGPU::CPol::SC1);
1710 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1711 // does not reorder memory operations with respect to a preceding buffer
1712 // invalidate. The invalidate is guaranteed to remove any cache lines of
1713 // earlier writes and ensures later reads will refetch the cache lines.
1714 Changed = true;
1715 break;
1716 case SIAtomicScope::WORKGROUP:
1717 // In threadgroup split mode the waves of a work-group can be executing on
1718 // different CUs. Therefore need to invalidate the L1 which is per CU.
1719 // Otherwise in non-threadgroup split mode all waves of a work-group are
1720 // on the same CU, and so the L1 does not need to be invalidated.
1721 if (ST.isTgSplitEnabled()) {
1722 // Ensures L1 is invalidated if in threadgroup split mode. In
1723 // non-threadgroup split mode it is a NOP, but there is no point generating
1724 // it in that case when we know we are not in that mode.
1725 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1726 // Set SC bits to indicate work-group scope.
1727 .addImm(AMDGPU::CPol::SC0);
1728 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1729 // does not reorder memory operations with respect to a preceding buffer
1730 // invalidate. The invalidate is guaranteed to remove any cache lines of
1731 // earlier writes and ensures later reads will refetch the cache lines.
1732 Changed = true;
1733 }
1734 break;
1735 case SIAtomicScope::WAVEFRONT:
1736 case SIAtomicScope::SINGLETHREAD:
1737 // Could generate "BUFFER_INV" but it would do nothing as there are no
1738 // caches to invalidate.
1739 break;
1740 default:
1741 llvm_unreachable("Unsupported synchronization scope");
1742 }
1743 }
1744
1745 /// The scratch address space does not need the global memory cache
1746 /// to be flushed as all memory operations by the same thread are
1747 /// sequentially consistent, and no other thread can access scratch
1748 /// memory.
1749
1750 /// Other address spaces do not have a cache.
1751
1752 if (Pos == Position::AFTER)
1753 --MI;
1754
1755 return Changed;
1756}
1757
1758bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1759 SIAtomicScope Scope,
1760 SIAtomicAddrSpace AddrSpace,
1761 bool IsCrossAddrSpaceOrdering,
1762 Position Pos) const {
1763 bool Changed = false;
1764
1765 MachineBasicBlock &MBB = *MI->getParent();
1766 DebugLoc DL = MI->getDebugLoc();
1767
1768 if (Pos == Position::AFTER)
1769 ++MI;
1770
1771 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1772 switch (Scope) {
1773 case SIAtomicScope::SYSTEM:
1774 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1775 // hardware does not reorder memory operations by the same wave with
1776 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1777 // to initiate writeback of any dirty cache lines of earlier writes by the
1778 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1779 // writeback has completed.
1780 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1781 // Set SC bits to indicate system scope.
1782 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1783 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1784 // SIAtomicScope::SYSTEM, the following insertWait will generate the
1785 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1786 Changed = true;
1787 break;
1788 case SIAtomicScope::AGENT:
1789 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1790 // Set SC bits to indicate agent scope.
1791 .addImm(AMDGPU::CPol::SC1);
1792
1793 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1794 // SIAtomicScope::AGENT, the following insertWait will generate the
1795 // required "S_WAITCNT vmcnt(0)".
1796 Changed = true;
1797 break;
1798 case SIAtomicScope::WORKGROUP:
1799 case SIAtomicScope::WAVEFRONT:
1800 case SIAtomicScope::SINGLETHREAD:
1801 // Do not generate "BUFFER_WBL2" as there are no caches it would
1802 // write back, and it would require an otherwise unnecessary
1803 // "S_WAITCNT vmcnt(0)".
1804 break;
1805 default:
1806 llvm_unreachable("Unsupported synchronization scope");
1807 }
1808 }
1809
1810 if (Pos == Position::AFTER)
1811 --MI;
1812
1813 // Insert the S_WAITCNT required by any "BUFFER_WBL2" above, as well as any
1814 // other S_WAITCNT that is needed.
1815 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1816 IsCrossAddrSpaceOrdering, Pos);
1817
1818 return Changed;
1819}
1820
1821bool SIGfx10CacheControl::enableLoadCacheBypass(
1822 const MachineBasicBlock::iterator &MI,
1823 SIAtomicScope Scope,
1824 SIAtomicAddrSpace AddrSpace) const {
1825 assert(MI->mayLoad() && !MI->mayStore());
1826 bool Changed = false;
1827
1828 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1829 switch (Scope) {
1830 case SIAtomicScope::SYSTEM:
1831 case SIAtomicScope::AGENT:
1832 // Set the L0 and L1 cache policies to MISS_EVICT.
1833 // Note: there is no L2 cache coherent bypass control at the ISA level.
1834 Changed |= enableGLCBit(MI);
1835 Changed |= enableDLCBit(MI);
1836 break;
1837 case SIAtomicScope::WORKGROUP:
1838 // In WGP mode the waves of a work-group can be executing on either CU of
1839 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1840 // CU mode all waves of a work-group are on the same CU, and so the L0
1841 // does not need to be bypassed.
1842 if (!ST.isCuModeEnabled())
1843 Changed |= enableGLCBit(MI);
1844 break;
1845 case SIAtomicScope::WAVEFRONT:
1846 case SIAtomicScope::SINGLETHREAD:
1847 // No cache to bypass.
1848 break;
1849 default:
1850 llvm_unreachable("Unsupported synchronization scope");
1851 }
1852 }
1853
1854 /// The scratch address space does not need the global memory caches
1855 /// to be bypassed as all memory operations by the same thread are
1856 /// sequentially consistent, and no other thread can access scratch
1857 /// memory.
1858
1859 /// Other address spaces do not have a cache.
1860
1861 return Changed;
1862}
1863
1864bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1865 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1866 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1867
1868 // Only handle load and store, not atomic read-modify-write instructions. The
1869 // latter use glc to indicate if the atomic returns a result and so must not
1870 // be used for cache control.
1871 assert(MI->mayLoad() ^ MI->mayStore());
1872
1873 // Only update load and store, not LLVM IR atomic read-modify-write
1874 // instructions. The latter are always marked as volatile, so they cannot be
1875 // handled sensibly here without pessimizing all atomics. They also do not
1876 // support the nontemporal attribute.
1877 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1878
1879 bool Changed = false;
1880
1881 if (IsVolatile) {
1882 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1883 // and MISS_LRU for store instructions.
1884 // Note: there is no L2 cache coherent bypass control at the ISA level.
1885 if (Op == SIMemOp::LOAD) {
1886 Changed |= enableGLCBit(MI);
1887 Changed |= enableDLCBit(MI);
1888 }
1889
1890 // Ensure operation has completed at system scope to cause all volatile
1891 // operations to be visible outside the program in a global order. Do not
1892 // request cross address space as only the global address space can be
1893 // observable outside the program, so no need to cause a waitcnt for LDS
1894 // address space operations.
1895 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1896 Position::AFTER);
1897 return Changed;
1898 }
1899
1900 if (IsNonTemporal) {
1901 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1902 // and L2 cache policy to STREAM.
1903 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1904 // to MISS_EVICT and the L2 cache policy to STREAM.
1905 if (Op == SIMemOp::STORE)
1906 Changed |= enableGLCBit(MI);
1907 Changed |= enableSLCBit(MI);
1908
1909 return Changed;
1910 }
1911
1912 return Changed;
1913}
1914
1915bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1916 SIAtomicScope Scope,
1917 SIAtomicAddrSpace AddrSpace,
1918 SIMemOp Op,
1919 bool IsCrossAddrSpaceOrdering,
1920 Position Pos) const {
1921 bool Changed = false;
1922
1923 MachineBasicBlock &MBB = *MI->getParent();
1924 DebugLoc DL = MI->getDebugLoc();
1925
1926 if (Pos == Position::AFTER)
1927 ++MI;
1928
1929 bool VMCnt = false;
1930 bool VSCnt = false;
1931 bool LGKMCnt = false;
1932
1933 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1934 SIAtomicAddrSpace::NONE) {
1935 switch (Scope) {
1936 case SIAtomicScope::SYSTEM:
1937 case SIAtomicScope::AGENT:
1938 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1939 VMCnt |= true;
1940 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1941 VSCnt |= true;
1942 break;
1943 case SIAtomicScope::WORKGROUP:
1944 // In WGP mode the waves of a work-group can be executing on either CU of
1945 // the WGP. Therefore need to wait for operations to complete to ensure
1946 // they are visible to waves in the other CU as the L0 is per CU.
1947 // Otherwise, in CU mode all waves of a work-group are on the same CU,
1948 // which shares the same L0.
1949 if (!ST.isCuModeEnabled()) {
1950 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1951 VMCnt |= true;
1952 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1953 VSCnt |= true;
1954 }
1955 break;
1956 case SIAtomicScope::WAVEFRONT:
1957 case SIAtomicScope::SINGLETHREAD:
1958 // The L0 cache keeps all memory operations in order for
1959 // work-items in the same wavefront.
1960 break;
1961 default:
1962 llvm_unreachable("Unsupported synchronization scope");
1963 }
1964 }
1965
1966 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1967 switch (Scope) {
1968 case SIAtomicScope::SYSTEM:
1969 case SIAtomicScope::AGENT:
1970 case SIAtomicScope::WORKGROUP:
1971 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1972 // not needed as LDS operations for all waves are executed in a total
1973 // global ordering as observed by all waves. Required if also
1974 // synchronizing with global/GDS memory as LDS operations could be
1975 // reordered with respect to later global/GDS memory operations of the
1976 // same wave.
1977 LGKMCnt |= IsCrossAddrSpaceOrdering;
1978 break;
1979 case SIAtomicScope::WAVEFRONT:
1980 case SIAtomicScope::SINGLETHREAD:
1981 // The LDS keeps all memory operations in order for
1982 // the same wavefront.
1983 break;
1984 default:
1985 llvm_unreachable("Unsupported synchronization scope");
1986 }
1987 }
1988
1989 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1990 switch (Scope) {
1991 case SIAtomicScope::SYSTEM:
1992 case SIAtomicScope::AGENT:
1993 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1994 // is not needed as GDS operations for all waves are executed in a total
1995 // global ordering as observed by all waves. Required if also
1996 // synchronizing with global/LDS memory as GDS operations could be
1997 // reordered with respect to later global/LDS memory operations of the
1998 // same wave.
1999 LGKMCnt |= IsCrossAddrSpaceOrdering;
2000 break;
2001 case SIAtomicScope::WORKGROUP:
2002 case SIAtomicScope::WAVEFRONT:
2003 case SIAtomicScope::SINGLETHREAD:
2004 // The GDS keeps all memory operations in order for
2005 // the same work-group.
2006 break;
2007 default:
2008 llvm_unreachable("Unsupported synchronization scope");
2009 }
2010 }
2011
2012 if (VMCnt || LGKMCnt) {
2013 unsigned WaitCntImmediate =
2014 AMDGPU::encodeWaitcnt(IV,
2015 VMCnt ? 0 : getVmcntBitMask(IV),
2016 getExpcntBitMask(IV),
2017 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2018 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2019 .addImm(WaitCntImmediate);
2020 Changed = true;
2021 }
2022
2023 if (VSCnt) {
2024 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2025 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2026 .addImm(0);
2027 Changed = true;
2028 }
2029
2030 if (Pos == Position::AFTER)
2031 --MI;
2032
2033 return Changed;
2034}
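For reference, a minimal sketch of how the S_WAITCNT immediate built above is assembled: counters that must drain are passed as 0, while counters that should not gate the wait are passed as their all-ones bit masks. This assumes it is compiled inside the AMDGPU backend where Utils/AMDGPUBaseInfo.h and the IsaVersion helpers are visible; the helper name buildVmcntOnlyWaitcnt is hypothetical and not part of this pass.

// Sketch under the assumptions stated above.
#include "Utils/AMDGPUBaseInfo.h"

unsigned buildVmcntOnlyWaitcnt(const llvm::AMDGPU::IsaVersion &IV) {
  using namespace llvm::AMDGPU;
  // Wait for all outstanding VMEM loads (vmcnt == 0) but place no
  // requirement on the export or LGKM counters by passing their full masks.
  return encodeWaitcnt(IV,
                       /*Vmcnt=*/0,
                       /*Expcnt=*/getExpcntBitMask(IV),
                       /*Lgkmcnt=*/getLgkmcntBitMask(IV));
}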
2035
2036bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2037 SIAtomicScope Scope,
2038 SIAtomicAddrSpace AddrSpace,
2039 Position Pos) const {
2040 if (!InsertCacheInv)
2041 return false;
2042
2043 bool Changed = false;
2044
2045 MachineBasicBlock &MBB = *MI->getParent();
2046 DebugLoc DL = MI->getDebugLoc();
2047
2048 if (Pos == Position::AFTER)
2049 ++MI;
2050
2051 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2052 switch (Scope) {
2053 case SIAtomicScope::SYSTEM:
2054 case SIAtomicScope::AGENT:
2055 // The order of invalidates matters here. We must invalidate "outer in"
2056 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2057 // invalidated.
2058 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2059 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2060 Changed = true;
2061 break;
2062 case SIAtomicScope::WORKGROUP:
2063 // In WGP mode the waves of a work-group can be executing on either CU of
2064 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2065 // in CU mode all waves of a work-group are on the same CU, and so the
2066 // L0 does not need to be invalidated.
2067 if (!ST.isCuModeEnabled()) {
2068 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2069 Changed = true;
2070 }
2071 break;
2072 case SIAtomicScope::WAVEFRONT:
2073 case SIAtomicScope::SINGLETHREAD:
2074 // No cache to invalidate.
2075 break;
2076 default:
2077 llvm_unreachable("Unsupported synchronization scope");
2078 }
2079 }
2080
2081 /// The scratch address space does not need the global memory cache
2082 /// to be flushed as all memory operations by the same thread are
2083 /// sequentially consistent, and no other thread can access scratch
2084 /// memory.
2085
2086 /// Other address spaces do not have a cache.
2087
2088 if (Pos == Position::AFTER)
2089 --MI;
2090
2091 return Changed;
2092}
2093
2094bool SIGfx11CacheControl::enableLoadCacheBypass(
2095 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2096 SIAtomicAddrSpace AddrSpace) const {
2097 assert(MI->mayLoad() && !MI->mayStore());
2098 bool Changed = false;
2099
2100 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2101 switch (Scope) {
2102 case SIAtomicScope::SYSTEM:
2103 case SIAtomicScope::AGENT:
2104 // Set the L0 and L1 cache policies to MISS_EVICT.
2105 // Note: there is no L2 cache coherent bypass control at the ISA level.
2106 Changed |= enableGLCBit(MI);
2107 break;
2108 case SIAtomicScope::WORKGROUP:
2109 // In WGP mode the waves of a work-group can be executing on either CU of
2110 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2111 // CU mode all waves of a work-group are on the same CU, and so the L0
2112 // does not need to be bypassed.
2113 if (!ST.isCuModeEnabled())
2114 Changed |= enableGLCBit(MI);
2115 break;
2116 case SIAtomicScope::WAVEFRONT:
2117 case SIAtomicScope::SINGLETHREAD:
2118 // No cache to bypass.
2119 break;
2120 default:
2121 llvm_unreachable("Unsupported synchronization scope");
2122 }
2123 }
2124
2125 /// The scratch address space does not need the global memory caches
2126 /// to be bypassed as all memory operations by the same thread are
2127 /// sequentially consistent, and no other thread can access scratch
2128 /// memory.
2129
2130 /// Other address spaces do not have a cache.
2131
2132 return Changed;
2133}
2134
2135bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2136 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2137 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2138
2139 // Only handle load and store, not atomic read-modify-write instructions. The
2140 // latter use glc to indicate if the atomic returns a result and so must not
2141 // be used for cache control.
2142 assert(MI->mayLoad() ^ MI->mayStore());
2143
2144 // Only update load and store, not LLVM IR atomic read-modify-write
2145 // instructions. The latter are always marked as volatile, so they cannot be
2146 // handled sensibly here without pessimizing all atomics. They also do not
2147 // support the nontemporal attribute.
2148 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2149
2150 bool Changed = false;
2151
2152 if (IsVolatile) {
2153 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2154 // and MISS_LRU for store instructions.
2155 // Note: there is no L2 cache coherent bypass control at the ISA level.
2156 if (Op == SIMemOp::LOAD)
2157 Changed |= enableGLCBit(MI);
2158
2159 // Set MALL NOALLOC for load and store instructions.
2160 Changed |= enableDLCBit(MI);
2161
2162 // Ensure operation has completed at system scope to cause all volatile
2163 // operations to be visible outside the program in a global order. Do not
2164 // request cross address space as only the global address space can be
2165 // observable outside the program, so no need to cause a waitcnt for LDS
2166 // address space operations.
2167 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2168 Position::AFTER);
2169 return Changed;
2170 }
2171
2172 if (IsNonTemporal) {
2173 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2174 // and L2 cache policy to STREAM.
2175 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2176 // to MISS_EVICT and the L2 cache policy to STREAM.
2177 if (Op == SIMemOp::STORE)
2178 Changed |= enableGLCBit(MI);
2179 Changed |= enableSLCBit(MI);
2180
2181 // Set MALL NOALLOC for load and store instructions.
2182 Changed |= enableDLCBit(MI);
2183 return Changed;
2184 }
2185
2186 return Changed;
2187}
2188
2189bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2190 AMDGPU::CPol::CPol Value) const {
2191 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2192 if (!CPol)
2193 return false;
2194
2195 uint64_t NewTH = Value & AMDGPU::CPol::TH;
2196 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2197 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2198 return true;
2199 }
2200
2201 return false;
2202}
2203
2204bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2205 AMDGPU::CPol::CPol Value) const {
2206 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2207 if (!CPol)
2208 return false;
2209
2210 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2211 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2212 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2213 return true;
2214 }
2215
2216 return false;
2217}
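setTH and setScope above follow the same read-modify-write pattern on the packed cpol immediate: mask the incoming value down to a single field, compare it against the current field, and rewrite only those bits. A self-contained sketch of that pattern, using an illustrative mask rather than the real AMDGPU::CPol encoding:

// Illustrative only: FieldMask is a made-up 3-bit field, not the actual
// AMDGPU::CPol::TH or AMDGPU::CPol::SCOPE layout.
#include <cstdint>

constexpr uint64_t FieldMask = 0x7u;

// Returns the updated immediate; only the bits covered by FieldMask change.
constexpr uint64_t setPolicyField(uint64_t CPolImm, uint64_t NewValue) {
  const uint64_t Field = NewValue & FieldMask;   // keep only this field's bits
  if ((CPolImm & FieldMask) == Field)
    return CPolImm;                              // already set; no change
  return (CPolImm & ~FieldMask) | Field;         // clear the field, then insert
}

static_assert(setPolicyField(0xF8u, 0x3u) == 0xFBu, "field replaced in place");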
2218
2219bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2220 const MachineBasicBlock::iterator MI) const {
2221 // TODO: implement flag for frontend to give us a hint not to insert waits.
2222
2223 MachineBasicBlock &MBB = *MI->getParent();
2224 const DebugLoc &DL = MI->getDebugLoc();
2225
2226 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2227 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2228 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2229 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2230 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2231
2232 return true;
2233}
2234
2235bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2236 SIAtomicScope Scope,
2237 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2238 bool IsCrossAddrSpaceOrdering,
2239 Position Pos) const {
2240 bool Changed = false;
2241
2242 MachineBasicBlock &MBB = *MI->getParent();
2243 DebugLoc DL = MI->getDebugLoc();
2244
2245 bool LOADCnt = false;
2246 bool DSCnt = false;
2247 bool STORECnt = false;
2248
2249 if (Pos == Position::AFTER)
2250 ++MI;
2251
2252 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2253 SIAtomicAddrSpace::NONE) {
2254 switch (Scope) {
2255 case SIAtomicScope::SYSTEM:
2256 case SIAtomicScope::AGENT:
2257 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2258 LOADCnt |= true;
2259 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2260 STORECnt |= true;
2261 break;
2262 case SIAtomicScope::WORKGROUP:
2263 // In WGP mode the waves of a work-group can be executing on either CU of
2264 // the WGP. Therefore need to wait for operations to complete to ensure
2265 // they are visible to waves in the other CU as the L0 is per CU.
2266 // Otherwise, in CU mode all waves of a work-group are on the same CU,
2267 // which shares the same L0.
2268 if (!ST.isCuModeEnabled()) {
2269 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2270 LOADCnt |= true;
2271 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2272 STORECnt |= true;
2273 }
2274 break;
2275 case SIAtomicScope::WAVEFRONT:
2276 case SIAtomicScope::SINGLETHREAD:
2277 // The L0 cache keeps all memory operations in order for
2278 // work-items in the same wavefront.
2279 break;
2280 default:
2281 llvm_unreachable("Unsupported synchronization scope");
2282 }
2283 }
2284
2285 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2286 switch (Scope) {
2287 case SIAtomicScope::SYSTEM:
2288 case SIAtomicScope::AGENT:
2289 case SIAtomicScope::WORKGROUP:
2290 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2291 // not needed as LDS operations for all waves are executed in a total
2292 // global ordering as observed by all waves. Required if also
2293 // synchronizing with global/GDS memory as LDS operations could be
2294 // reordered with respect to later global/GDS memory operations of the
2295 // same wave.
2296 DSCnt |= IsCrossAddrSpaceOrdering;
2297 break;
2298 case SIAtomicScope::WAVEFRONT:
2299 case SIAtomicScope::SINGLETHREAD:
2300 // The LDS keeps all memory operations in order for
2301 // the same wavefront.
2302 break;
2303 default:
2304 llvm_unreachable("Unsupported synchronization scope");
2305 }
2306 }
2307
2308 if (LOADCnt) {
2309 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2310 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2311 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2312 Changed = true;
2313 }
2314
2315 if (STORECnt) {
2316 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2317 Changed = true;
2318 }
2319
2320 if (DSCnt) {
2321 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2322 Changed = true;
2323 }
2324
2325 if (Pos == Position::AFTER)
2326 --MI;
2327
2328 return Changed;
2329}
2330
2331bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2332 SIAtomicScope Scope,
2333 SIAtomicAddrSpace AddrSpace,
2334 Position Pos) const {
2335 if (!InsertCacheInv)
2336 return false;
2337
2338 MachineBasicBlock &MBB = *MI->getParent();
2339 DebugLoc DL = MI->getDebugLoc();
2340
2341 /// The scratch address space does not need the global memory cache
2342 /// to be flushed as all memory operations by the same thread are
2343 /// sequentially consistent, and no other thread can access scratch
2344 /// memory.
2345
2346 /// Other address spaces do not have a cache.
2347 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2348 return false;
2349
2350 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2351 switch (Scope) {
2352 case SIAtomicScope::SYSTEM:
2353 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2354 break;
2355 case SIAtomicScope::AGENT:
2356 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2357 break;
2358 case SIAtomicScope::WORKGROUP:
2359 // In WGP mode the waves of a work-group can be executing on either CU of
2360 // the WGP. Therefore we need to invalidate the L0 which is per CU.
2361 // Otherwise in CU mode all waves of a work-group are on the same CU, and so
2362 // the L0 does not need to be invalidated.
2363 if (ST.isCuModeEnabled())
2364 return false;
2365
2366 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2367 break;
2368 case SIAtomicScope::WAVEFRONT:
2369 case SIAtomicScope::SINGLETHREAD:
2370 // No cache to invalidate.
2371 return false;
2372 default:
2373 llvm_unreachable("Unsupported synchronization scope");
2374 }
2375
2376 if (Pos == Position::AFTER)
2377 ++MI;
2378
2379 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2380
2381 if (Pos == Position::AFTER)
2382 --MI;
2383
2384 return true;
2385}
2386
2387bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2388 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2389 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2390
2391 // Only handle load and store, not atomic read-modify-write instructions.
2392 assert(MI->mayLoad() ^ MI->mayStore());
2393
2394 // Only update load and store, not LLVM IR atomic read-modify-write
2395 // instructions. The latter are always marked as volatile, so they cannot be
2396 // handled sensibly here without pessimizing all atomics. They also do not
2397 // support the nontemporal attribute.
2398 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2399
2400 bool Changed = false;
2401
2402 if (IsLastUse) {
2403 // Set last-use hint.
2404 Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2405 } else if (IsNonTemporal) {
2406 // Set non-temporal hint for all cache levels.
2407 Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2408 }
2409
2410 if (IsVolatile) {
2411 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2412
2413 if (Op == SIMemOp::STORE)
2414 Changed |= insertWaitsBeforeSystemScopeStore(MI);
2415
2416 // Ensure operation has completed at system scope to cause all volatile
2417 // operations to be visible outside the program in a global order. Do not
2418 // request cross address space as only the global address space can be
2419 // observable outside the program, so no need to cause a waitcnt for LDS
2420 // address space operations.
2421 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2422 Position::AFTER);
2423 }
2424
2425 return Changed;
2426}
2427
2428bool SIGfx12CacheControl::expandSystemScopeStore(
2429 MachineBasicBlock::iterator &MI) const {
2430 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2431 if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2432 return insertWaitsBeforeSystemScopeStore(MI);
2433
2434 return false;
2435}
2436
2437bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2438 if (AtomicPseudoMIs.empty())
2439 return false;
2440
2441 for (auto &MI : AtomicPseudoMIs)
2442 MI->eraseFromParent();
2443
2444 AtomicPseudoMIs.clear();
2445 return true;
2446}
2447
2448bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2449 MachineBasicBlock::iterator &MI) {
2450 assert(MI->mayLoad() && !MI->mayStore());
2451
2452 bool Changed = false;
2453
2454 if (MOI.isAtomic()) {
2455 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2456 MOI.getOrdering() == AtomicOrdering::Acquire ||
2457 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2458 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2459 MOI.getOrderingAddrSpace());
2460 }
2461
2462 if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2463 Changed |= CC->insertWait(MI, MOI.getScope(),
2464 MOI.getOrderingAddrSpace(),
2465 SIMemOp::LOAD | SIMemOp::STORE,
2466 MOI.getIsCrossAddressSpaceOrdering(),
2467 Position::BEFORE);
2468
2469 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2470 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2471 Changed |= CC->insertWait(MI, MOI.getScope(),
2472 MOI.getInstrAddrSpace(),
2473 SIMemOp::LOAD,
2474 MOI.getIsCrossAddressSpaceOrdering(),
2475 Position::AFTER);
2476 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2477 MOI.getOrderingAddrSpace(),
2478 Position::AFTER);
2479 }
2480
2481 return Changed;
2482 }
2483
2484 // Atomic instructions already bypass caches to the scope specified by the
2485 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2486 // instructions need additional treatment.
2487 Changed |= CC->enableVolatileAndOrNonTemporal(
2488 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2489 MOI.isNonTemporal(), MOI.isLastUse());
2490
2491 return Changed;
2492}
2493
2494bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2495 MachineBasicBlock::iterator &MI) {
2496 assert(!MI->mayLoad() && MI->mayStore());
2497
2498 bool Changed = false;
2499
2500 if (MOI.isAtomic()) {
2501 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2502 MOI.getOrdering() == AtomicOrdering::Release ||
2503 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2504 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2505 MOI.getOrderingAddrSpace());
2506 }
2507
2508 if (MOI.getOrdering() == AtomicOrdering::Release ||
2509 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2510 Changed |= CC->insertRelease(MI, MOI.getScope(),
2511 MOI.getOrderingAddrSpace(),
2512 MOI.getIsCrossAddressSpaceOrdering(),
2513 Position::BEFORE);
2514
2515 return Changed;
2516 }
2517
2518 // Atomic instructions already bypass caches to the scope specified by the
2519 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2520 // need additional treatment.
2521 Changed |= CC->enableVolatileAndOrNonTemporal(
2522 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2523 MOI.isNonTemporal());
2524
2525 // GFX12 specific: the scope (the desired coherence domain in the cache
2526 // hierarchy) is an instruction field; do not confuse it with the atomic scope.
2527 Changed |= CC->expandSystemScopeStore(MI);
2528 return Changed;
2529}
2530
2531bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2532 MachineBasicBlock::iterator &MI) {
2533 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2534
2535 AtomicPseudoMIs.push_back(MI);
2536 bool Changed = false;
2537
2538 if (MOI.isAtomic()) {
2539 if (MOI.getOrdering() == AtomicOrdering::Acquire)
2540 Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2541 SIMemOp::LOAD | SIMemOp::STORE,
2542 MOI.getIsCrossAddressSpaceOrdering(),
2543 Position::BEFORE);
2544
2545 if (MOI.getOrdering() == AtomicOrdering::Release ||
2546 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2547 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2548 /// TODO: This relies on a barrier always generating a waitcnt
2549 /// for LDS to ensure it is not reordered with the completion of
2550 /// the preceding LDS operations. If the barrier had a memory
2551 /// ordering and memory scope, then the library would not need to
2552 /// generate a fence. Support for barriers could be added in this
2553 /// file. SIInsertWaitcnt.cpp could then stop unconditionally
2554 /// adding an S_WAITCNT before an S_BARRIER.
2555 Changed |= CC->insertRelease(MI, MOI.getScope(),
2556 MOI.getOrderingAddrSpace(),
2557 MOI.getIsCrossAddressSpaceOrdering(),
2558 Position::BEFORE);
2559
2560 // TODO: If both release and invalidate are happening they could be combined
2561 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2562 // reorganizing this code or as part of optimizing the SIInsertWaitcnt pass to
2563 // track cache invalidate and write back instructions.
2564
2565 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2566 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2567 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2568 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2569 MOI.getOrderingAddrSpace(),
2570 Position::BEFORE);
2571
2572 return Changed;
2573 }
2574
2575 return Changed;
2576}
2577
2578bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2579 MachineBasicBlock::iterator &MI) {
2580 assert(MI->mayLoad() && MI->mayStore());
2581
2582 bool Changed = false;
2583
2584 if (MOI.isAtomic()) {
2585 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2586 MOI.getOrdering() == AtomicOrdering::Acquire ||
2587 MOI.getOrdering() == AtomicOrdering::Release ||
2588 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2589 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2590 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2591 MOI.getInstrAddrSpace());
2592 }
2593
2594 if (MOI.getOrdering() == AtomicOrdering::Release ||
2595 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2596 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2597 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2598 Changed |= CC->insertRelease(MI, MOI.getScope(),
2599 MOI.getOrderingAddrSpace(),
2600 MOI.getIsCrossAddressSpaceOrdering(),
2601 Position::BEFORE);
2602
2603 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2604 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2605 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2606 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2607 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2608 Changed |= CC->insertWait(MI, MOI.getScope(),
2609 MOI.getInstrAddrSpace(),
2610 isAtomicRet(*MI) ? SIMemOp::LOAD :
2611 SIMemOp::STORE,
2612 MOI.getIsCrossAddressSpaceOrdering(),
2613 Position::AFTER);
2614 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2615 MOI.getOrderingAddrSpace(),
2616 Position::AFTER);
2617 }
2618
2619 return Changed;
2620 }
2621
2622 return Changed;
2623}
2624
2625bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2626 bool Changed = false;
2627
2628 SIMemOpAccess MOA(MF);
2629 CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2630
2631 for (auto &MBB : MF) {
2632 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2633
2634 // Unbundle instructions after the post-RA scheduler.
2635 if (MI->isBundle() && MI->mayLoadOrStore()) {
2636 MachineBasicBlock::instr_iterator II(MI->getIterator());
2637 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2638 I != E && I->isBundledWithPred(); ++I) {
2639 I->unbundleFromPred();
2640 for (MachineOperand &MO : I->operands())
2641 if (MO.isReg())
2642 MO.setIsInternalRead(false);
2643 }
2644
2645 MI->eraseFromParent();
2646 MI = II->getIterator();
2647 }
2648
2649 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2650 continue;
2651
2652 if (const auto &MOI = MOA.getLoadInfo(MI))
2653 Changed |= expandLoad(*MOI, MI);
2654 else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2655 Changed |= expandStore(*MOI, MI);
2656 Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
2657 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2658 Changed |= expandAtomicFence(*MOI, MI);
2659 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2660 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2661 }
2662 }
2663
2664 Changed |= removeAtomicPseudoMIs();
2665 return Changed;
2666}
2667
2668INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2669
2670char SIMemoryLegalizer::ID = 0;
2671char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2672
2673 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2674 return new SIMemoryLegalizer();
2675}