1//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Memory legalizer - implements memory model. More information can be
11/// found here:
12/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13//
14//===----------------------------------------------------------------------===//
15
16#include "AMDGPU.h"
17#include "AMDGPUMachineModuleInfo.h"
18#include "GCNSubtarget.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "llvm/ADT/BitmaskEnum.h"
21#include "llvm/CodeGen/MachineBasicBlock.h"
22#include "llvm/CodeGen/MachineFunctionPass.h"
23#include "llvm/IR/DiagnosticInfo.h"
24#include "llvm/Support/AtomicOrdering.h"
25#include "llvm/TargetParser/TargetParser.h"
26
27using namespace llvm;
28using namespace llvm::AMDGPU;
29
30#define DEBUG_TYPE "si-memory-legalizer"
31#define PASS_NAME "SI Memory Legalizer"
32
33static cl::opt<bool> AmdgcnSkipCacheInvalidations(
34 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
35 cl::desc("Use this to skip inserting cache invalidating instructions."));
36
37namespace {
38
39LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
40
41/// Memory operation flags. Can be ORed together.
42enum class SIMemOp {
43 NONE = 0u,
44 LOAD = 1u << 0,
45 STORE = 1u << 1,
46 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
47};
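// Illustrative sketch (not part of the original source): because of
// LLVM_MARK_AS_BITMASK_ENUM, SIMemOp values compose and test like plain bit
// masks, e.g.:
//
//   SIMemOp Op = SIMemOp::LOAD | SIMemOp::STORE;
//   if ((Op & SIMemOp::STORE) != SIMemOp::NONE) {
//     // Op covers store operations as well as loads.
//   }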
48
49/// Position to insert a new instruction relative to an existing
50/// instruction.
51enum class Position {
52 BEFORE,
53 AFTER
54};
55
56/// The atomic synchronization scopes supported by the AMDGPU target.
57enum class SIAtomicScope {
58 NONE,
59 SINGLETHREAD,
60 WAVEFRONT,
61 WORKGROUP,
62 AGENT,
63 SYSTEM
64};
65
66/// The distinct address spaces supported by the AMDGPU target for
67/// atomic memory operations. Can be ORed together.
68enum class SIAtomicAddrSpace {
69 NONE = 0u,
70 GLOBAL = 1u << 0,
71 LDS = 1u << 1,
72 SCRATCH = 1u << 2,
73 GDS = 1u << 3,
74 OTHER = 1u << 4,
75
76 /// The address spaces that can be accessed by a FLAT instruction.
77 FLAT = GLOBAL | LDS | SCRATCH,
78
79 /// The address spaces that support atomic instructions.
80 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
81
82 /// All address spaces.
83 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
84
85 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
86};
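// Illustrative sketch (not part of the original source): the address-space
// masks are typically tested by intersecting with a group and comparing
// against NONE, as the code below does throughout, e.g.:
//
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS;
//   bool NeedsVMCnt =
//       (AS & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
//       SIAtomicAddrSpace::NONE;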
87
88class SIMemOpInfo final {
89private:
90
91 friend class SIMemOpAccess;
92
93 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
94 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
95 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
96 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
97 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
98 bool IsCrossAddressSpaceOrdering = false;
99 bool IsVolatile = false;
100 bool IsNonTemporal = false;
101
102 SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
103 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
104 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
105 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
106 bool IsCrossAddressSpaceOrdering = true,
107 AtomicOrdering FailureOrdering =
108 AtomicOrdering::SequentiallyConsistent,
109 bool IsVolatile = false,
110 bool IsNonTemporal = false)
111 : Ordering(Ordering), FailureOrdering(FailureOrdering),
112 Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
113 InstrAddrSpace(InstrAddrSpace),
114 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115 IsVolatile(IsVolatile),
116 IsNonTemporal(IsNonTemporal) {
117
118 if (Ordering == AtomicOrdering::NotAtomic) {
119 assert(Scope == SIAtomicScope::NONE &&
120 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
121 !IsCrossAddressSpaceOrdering &&
122 FailureOrdering == AtomicOrdering::NotAtomic);
123 return;
124 }
125
126 assert(Scope != SIAtomicScope::NONE &&
127 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
128 SIAtomicAddrSpace::NONE &&
129 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130 SIAtomicAddrSpace::NONE);
131
132 // There is also no cross address space ordering if the ordering
133 // address space is the same as the instruction address space and
134 // only contains a single address space.
135 if ((OrderingAddrSpace == InstrAddrSpace) &&
136 isPowerOf2_32(uint32_t(InstrAddrSpace)))
137 this->IsCrossAddressSpaceOrdering = false;
138
139 // Limit the scope to the maximum supported by the instruction's address
140 // spaces.
141 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142 SIAtomicAddrSpace::NONE) {
143 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144 } else if ((InstrAddrSpace &
145 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146 SIAtomicAddrSpace::NONE) {
147 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148 } else if ((InstrAddrSpace &
149 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152 }
153 }
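// Worked example (not from the original source): an atomic that only touches
// LDS has InstrAddrSpace == LDS, so (InstrAddrSpace & ~(SCRATCH | LDS)) is
// NONE and a requested SYSTEM scope is clamped to WORKGROUP; LDS is only
// shared within a work-group, so no wider scope can observe it.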
154
155public:
156 /// \returns Atomic synchronization scope of the machine instruction used to
157 /// create this SIMemOpInfo.
158 SIAtomicScope getScope() const {
159 return Scope;
160 }
161
162 /// \returns Ordering constraint of the machine instruction used to
163 /// create this SIMemOpInfo.
164 AtomicOrdering getOrdering() const {
165 return Ordering;
166 }
167
168 /// \returns Failure ordering constraint of the machine instruction used to
169 /// create this SIMemOpInfo.
170 AtomicOrdering getFailureOrdering() const {
171 return FailureOrdering;
172 }
173
174 /// \returns The address spaces accessed by the machine
175 /// instruction used to create this SIMemOpInfo.
176 SIAtomicAddrSpace getInstrAddrSpace() const {
177 return InstrAddrSpace;
178 }
179
180 /// \returns The address spaces that must be ordered by the machine
181 /// instruction used to create this SIMemOpInfo.
182 SIAtomicAddrSpace getOrderingAddrSpace() const {
183 return OrderingAddrSpace;
184 }
185
186 /// \returns True iff memory ordering of operations on
187 /// different address spaces is required.
188 bool getIsCrossAddressSpaceOrdering() const {
189 return IsCrossAddressSpaceOrdering;
190 }
191
192 /// \returns True if memory access of the machine instruction used to
193 /// create this SIMemOpInfo is volatile, false otherwise.
194 bool isVolatile() const {
195 return IsVolatile;
196 }
197
198 /// \returns True if memory access of the machine instruction used to
199 /// create this SIMemOpInfo is nontemporal, false otherwise.
200 bool isNonTemporal() const {
201 return IsNonTemporal;
202 }
203
204 /// \returns True if ordering constraint of the machine instruction used to
205 /// create this SIMemOpInfo is unordered or higher, false otherwise.
206 bool isAtomic() const {
207 return Ordering != AtomicOrdering::NotAtomic;
208 }
209
210};
211
212class SIMemOpAccess final {
213private:
214 AMDGPUMachineModuleInfo *MMI = nullptr;
215
216 /// Reports unsupported message \p Msg for \p MI to LLVM context.
217 void reportUnsupported(const MachineBasicBlock::iterator &MI,
218 const char *Msg) const;
219
220 /// Inspects the target synchronization scope \p SSID and determines
221 /// the SI atomic scope it corresponds to, the address spaces it
222 /// covers, and whether the memory ordering applies between address
223 /// spaces.
224 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
225 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
226
227 /// \returns A bit set of the address spaces accessed by \p AS.
228 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
229
230 /// \returns Info constructed from \p MI, which has at least one machine
231 /// memory operand.
232 std::optional<SIMemOpInfo>
233 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
234
235public:
236 /// Construct class to support accessing the machine memory operands
237 /// of instructions in the machine function \p MF.
238 SIMemOpAccess(MachineFunction &MF);
239
240 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
241 std::optional<SIMemOpInfo>
242 getLoadInfo(const MachineBasicBlock::iterator &MI) const;
243
244 /// \returns Store info if \p MI is a store operation, "std::nullopt"
245 /// otherwise.
246 std::optional<SIMemOpInfo>
247 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
248
249 /// \returns Atomic fence info if \p MI is an atomic fence operation,
250 /// "std::nullopt" otherwise.
251 std::optional<SIMemOpInfo>
252 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
253
254 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
255 /// rmw operation, "std::nullopt" otherwise.
256 std::optional<SIMemOpInfo>
257 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
258};
259
260class SICacheControl {
261protected:
262
263 /// AMDGPU subtarget info.
264 const GCNSubtarget &ST;
265
266 /// Instruction info.
267 const SIInstrInfo *TII = nullptr;
268
269 IsaVersion IV;
270
271 /// Whether to insert cache invalidating instructions.
272 bool InsertCacheInv;
273
274 SICacheControl(const GCNSubtarget &ST);
275
276 /// Sets the named cache-policy bit \p Bit to "true" if present in
277 /// instruction \p MI. \returns True if \p MI is modified, false otherwise.
278 bool enableNamedBit(const MachineBasicBlock::iterator MI,
279 AMDGPU::CPol::CPol Bit) const;
280
281public:
282
283 /// Create a cache control for the subtarget \p ST.
284 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
285
286 /// Update \p MI memory load instruction to bypass any caches up to
287 /// the \p Scope memory scope for address spaces \p
288 /// AddrSpace. Return true iff the instruction was modified.
289 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
290 SIAtomicScope Scope,
291 SIAtomicAddrSpace AddrSpace) const = 0;
292
293 /// Update \p MI memory store instruction to bypass any caches up to
294 /// the \p Scope memory scope for address spaces \p
295 /// AddrSpace. Return true iff the instruction was modified.
296 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
297 SIAtomicScope Scope,
298 SIAtomicAddrSpace AddrSpace) const = 0;
299
300 /// Update \p MI memory read-modify-write instruction to bypass any caches up
301 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
302 /// iff the instruction was modified.
303 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
304 SIAtomicScope Scope,
305 SIAtomicAddrSpace AddrSpace) const = 0;
306
307 /// Update \p MI memory instruction of kind \p Op associated with address
308 /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
309 /// true iff the instruction was modified.
310 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
311 SIAtomicAddrSpace AddrSpace,
312 SIMemOp Op, bool IsVolatile,
313 bool IsNonTemporal) const = 0;
314
315 /// Inserts any necessary instructions at position \p Pos relative
316 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
317 /// \p Op associated with address spaces \p AddrSpace have completed. Used
318 /// between memory instructions to enforce the order they become visible as
319 /// observed by other memory instructions executing in memory scope \p Scope.
320 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
321 /// address spaces. Returns true iff any instructions are inserted.
322 virtual bool insertWait(MachineBasicBlock::iterator &MI,
323 SIAtomicScope Scope,
324 SIAtomicAddrSpace AddrSpace,
325 SIMemOp Op,
326 bool IsCrossAddrSpaceOrdering,
327 Position Pos) const = 0;
328
329 /// Inserts any necessary instructions at position \p Pos relative to
330 /// instruction \p MI to ensure any subsequent memory instructions of this
331 /// thread with address spaces \p AddrSpace will observe the previous memory
332 /// operations by any thread for memory scopes up to memory scope \p Scope.
333 /// Returns true iff any instructions are inserted.
334 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
335 SIAtomicScope Scope,
336 SIAtomicAddrSpace AddrSpace,
337 Position Pos) const = 0;
338
339 /// Inserts any necessary instructions at position \p Pos relative to
340 /// instruction \p MI to ensure previous memory instructions by this thread
341 /// with address spaces \p AddrSpace have completed and can be observed by
342 /// subsequent memory instructions by any thread executing in memory scope \p
343 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
344 /// between address spaces. Returns true iff any instructions are inserted.
345 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
346 SIAtomicScope Scope,
347 SIAtomicAddrSpace AddrSpace,
348 bool IsCrossAddrSpaceOrdering,
349 Position Pos) const = 0;
350
351 /// Virtual destructor to allow derivations to be deleted.
352 virtual ~SICacheControl() = default;
353
354 virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
355 MachineBasicBlock::iterator &MI) const {
356 return false;
357 }
358};
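// Illustrative sketch (not part of the original source): one plausible
// ordering of calls into these hooks for an acquire load, as the expand*
// routines later in this pass are expected to use them:
//
//   Changed |= CC->enableLoadCacheBypass(MI, Scope, AddrSpace);
//   Changed |= CC->insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD,
//                             IsCrossAddrSpaceOrdering, Position::AFTER);
//   Changed |= CC->insertAcquire(MI, Scope, AddrSpace, Position::AFTER);
//
// i.e. the load bypasses stale caches, the wait makes it complete, and the
// acquire invalidates caches before younger accesses can hit them.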
359
360class SIGfx6CacheControl : public SICacheControl {
361protected:
362
363 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
364 /// is modified, false otherwise.
365 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
366 return enableNamedBit(MI, AMDGPU::CPol::GLC);
367 }
368
369 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
370 /// is modified, false otherwise.
371 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
372 return enableNamedBit(MI, AMDGPU::CPol::SLC);
373 }
374
375public:
376
377 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
378
379 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
380 SIAtomicScope Scope,
381 SIAtomicAddrSpace AddrSpace) const override;
382
383 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
384 SIAtomicScope Scope,
385 SIAtomicAddrSpace AddrSpace) const override;
386
387 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
388 SIAtomicScope Scope,
389 SIAtomicAddrSpace AddrSpace) const override;
390
391 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
392 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
393 bool IsVolatile,
394 bool IsNonTemporal) const override;
395
396 bool insertWait(MachineBasicBlock::iterator &MI,
397 SIAtomicScope Scope,
398 SIAtomicAddrSpace AddrSpace,
399 SIMemOp Op,
400 bool IsCrossAddrSpaceOrdering,
401 Position Pos) const override;
402
403 bool insertAcquire(MachineBasicBlock::iterator &MI,
404 SIAtomicScope Scope,
405 SIAtomicAddrSpace AddrSpace,
406 Position Pos) const override;
407
408 bool insertRelease(MachineBasicBlock::iterator &MI,
409 SIAtomicScope Scope,
410 SIAtomicAddrSpace AddrSpace,
411 bool IsCrossAddrSpaceOrdering,
412 Position Pos) const override;
413};
414
415class SIGfx7CacheControl : public SIGfx6CacheControl {
416public:
417
418 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
419
420 bool insertAcquire(MachineBasicBlock::iterator &MI,
421 SIAtomicScope Scope,
422 SIAtomicAddrSpace AddrSpace,
423 Position Pos) const override;
424
425};
426
427class SIGfx90ACacheControl : public SIGfx7CacheControl {
428public:
429
430 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
431
432 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
433 SIAtomicScope Scope,
434 SIAtomicAddrSpace AddrSpace) const override;
435
436 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
437 SIAtomicScope Scope,
438 SIAtomicAddrSpace AddrSpace) const override;
439
440 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
441 SIAtomicScope Scope,
442 SIAtomicAddrSpace AddrSpace) const override;
443
444 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
445 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
446 bool IsVolatile,
447 bool IsNonTemporal) const override;
448
449 bool insertWait(MachineBasicBlock::iterator &MI,
450 SIAtomicScope Scope,
451 SIAtomicAddrSpace AddrSpace,
452 SIMemOp Op,
453 bool IsCrossAddrSpaceOrdering,
454 Position Pos) const override;
455
456 bool insertAcquire(MachineBasicBlock::iterator &MI,
457 SIAtomicScope Scope,
458 SIAtomicAddrSpace AddrSpace,
459 Position Pos) const override;
460
461 bool insertRelease(MachineBasicBlock::iterator &MI,
462 SIAtomicScope Scope,
463 SIAtomicAddrSpace AddrSpace,
464 bool IsCrossAddrSpaceOrdering,
465 Position Pos) const override;
466};
467
468class SIGfx940CacheControl : public SIGfx90ACacheControl {
469protected:
470
471 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
472 /// is modified, false otherwise.
473 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
474 return enableNamedBit(MI, AMDGPU::CPol::SC0);
475 }
476
477 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
478 /// is modified, false otherwise.
479 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
480 return enableNamedBit(MI, AMDGPU::CPol::SC1);
481 }
482
483 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
484 /// is modified, false otherwise.
485 bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
486 return enableNamedBit(MI, AMDGPU::CPol::NT);
487 }
488
489public:
490
491 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
492
493 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
494 SIAtomicScope Scope,
495 SIAtomicAddrSpace AddrSpace) const override;
496
497 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
498 SIAtomicScope Scope,
499 SIAtomicAddrSpace AddrSpace) const override;
500
501 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
502 SIAtomicScope Scope,
503 SIAtomicAddrSpace AddrSpace) const override;
504
505 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
506 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
507 bool IsVolatile,
508 bool IsNonTemporal) const override;
509
510 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
511 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
512
513 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
514 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
515 Position Pos) const override;
516
517 bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
518 MachineBasicBlock::iterator &MI) const override {
519 bool Changed = false;
520 if (ST.hasForceStoreSC0SC1() &&
521 (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
522 SIAtomicAddrSpace::GLOBAL |
523 SIAtomicAddrSpace::OTHER)) !=
524 SIAtomicAddrSpace::NONE) {
525 Changed |= enableSC0Bit(MI);
526 Changed |= enableSC1Bit(MI);
527 }
528 return Changed;
529 }
530};
531
532class SIGfx10CacheControl : public SIGfx7CacheControl {
533protected:
534
535 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
536 /// is modified, false otherwise.
537 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
538 return enableNamedBit(MI, AMDGPU::CPol::DLC);
539 }
540
541public:
542
543 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
544
545 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
546 SIAtomicScope Scope,
547 SIAtomicAddrSpace AddrSpace) const override;
548
549 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
550 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
551 bool IsVolatile,
552 bool IsNonTemporal) const override;
553
554 bool insertWait(MachineBasicBlock::iterator &MI,
555 SIAtomicScope Scope,
556 SIAtomicAddrSpace AddrSpace,
557 SIMemOp Op,
558 bool IsCrossAddrSpaceOrdering,
559 Position Pos) const override;
560
561 bool insertAcquire(MachineBasicBlock::iterator &MI,
562 SIAtomicScope Scope,
563 SIAtomicAddrSpace AddrSpace,
564 Position Pos) const override;
565};
566
567class SIGfx11CacheControl : public SIGfx10CacheControl {
568public:
569 SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
570
571 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
572 SIAtomicScope Scope,
573 SIAtomicAddrSpace AddrSpace) const override;
574
575 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
576 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
577 bool IsVolatile,
578 bool IsNonTemporal) const override;
579};
580
581class SIMemoryLegalizer final : public MachineFunctionPass {
582private:
583
584 /// Cache Control.
585 std::unique_ptr<SICacheControl> CC = nullptr;
586
587 /// List of atomic pseudo instructions.
588 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
589
590 /// Return true iff instruction \p MI is an atomic instruction that
591 /// returns a result.
592 bool isAtomicRet(const MachineInstr &MI) const {
593 return SIInstrInfo::isAtomicRet(MI);
594 }
595
596 /// Removes all processed atomic pseudo instructions from the current
597 /// function. Returns true if current function is modified, false otherwise.
598 bool removeAtomicPseudoMIs();
599
600 /// Expands load operation \p MI. Returns true if instructions are
601 /// added/deleted or \p MI is modified, false otherwise.
602 bool expandLoad(const SIMemOpInfo &MOI,
603 MachineBasicBlock::iterator &MI);
604 /// Expands store operation \p MI. Returns true if instructions are
605 /// added/deleted or \p MI is modified, false otherwise.
606 bool expandStore(const SIMemOpInfo &MOI,
607 MachineBasicBlock::iterator &MI);
608 /// Expands atomic fence operation \p MI. Returns true if
609 /// instructions are added/deleted or \p MI is modified, false otherwise.
610 bool expandAtomicFence(const SIMemOpInfo &MOI,
611 MachineBasicBlock::iterator &MI);
612 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
613 /// instructions are added/deleted or \p MI is modified, false otherwise.
614 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
615 MachineBasicBlock::iterator &MI);
616
617public:
618 static char ID;
619
620 SIMemoryLegalizer() : MachineFunctionPass(ID) {}
621
622 void getAnalysisUsage(AnalysisUsage &AU) const override {
623 AU.setPreservesCFG();
624 MachineFunctionPass::getAnalysisUsage(AU);
625 }
626
627 StringRef getPassName() const override {
628 return PASS_NAME;
629 }
630
631 bool runOnMachineFunction(MachineFunction &MF) override;
632};
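// Illustrative sketch (not part of the original source) of the driver this
// pass uses in runOnMachineFunction: walk every instruction, classify it
// through SIMemOpAccess, and hand it to the matching expand* routine, roughly:
//
//   SIMemOpAccess MOA(MF);
//   for (auto &MBB : MF)
//     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
//       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
//         continue;
//       if (const auto MOI = MOA.getLoadInfo(MI))
//         Changed |= expandLoad(*MOI, MI);
//       else if (const auto MOI = MOA.getStoreInfo(MI))
//         Changed |= expandStore(*MOI, MI);
//       else if (const auto MOI = MOA.getAtomicFenceInfo(MI))
//         Changed |= expandAtomicFence(*MOI, MI);
//       else if (const auto MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
//         Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
//     }
//   Changed |= removeAtomicPseudoMIs();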
633
634} // end namespace anonymous
635
636void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
637 const char *Msg) const {
638 const Function &Func = MI->getParent()->getParent()->getFunction();
639 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
640 Func.getContext().diagnose(Diag);
641}
642
643std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
644SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
645 SIAtomicAddrSpace InstrAddrSpace) const {
646 if (SSID == SyncScope::System)
647 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
648 if (SSID == MMI->getAgentSSID())
649 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
650 if (SSID == MMI->getWorkgroupSSID())
651 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
652 true);
653 if (SSID == MMI->getWavefrontSSID())
654 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
655 true);
656 if (SSID == SyncScope::SingleThread)
657 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
658 true);
659 if (SSID == MMI->getSystemOneAddressSpaceSSID())
660 return std::tuple(SIAtomicScope::SYSTEM,
661 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
662 if (SSID == MMI->getAgentOneAddressSpaceSSID())
663 return std::tuple(SIAtomicScope::AGENT,
664 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
665 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
666 return std::tuple(SIAtomicScope::WORKGROUP,
667 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
668 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
669 return std::tuple(SIAtomicScope::WAVEFRONT,
670 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
671 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
672 return std::tuple(SIAtomicScope::SINGLETHREAD,
673 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
674 return std::nullopt;
675}
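// Illustrative sketch (not part of the original source): an IR fence such as
//
//   fence syncscope("agent-one-as") acquire
//
// reaches this pass with SSID == MMI->getAgentOneAddressSpaceSSID(), so the
// mapping above yields {SIAtomicScope::AGENT,
// SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, /*IsCrossAddressSpaceOrdering=*/false}.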
676
677SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
678 if (AS == AMDGPUAS::FLAT_ADDRESS)
679 return SIAtomicAddrSpace::FLAT;
680 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
681 return SIAtomicAddrSpace::GLOBAL;
682 if (AS == AMDGPUAS::LOCAL_ADDRESS)
683 return SIAtomicAddrSpace::LDS;
684 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
685 return SIAtomicAddrSpace::SCRATCH;
686 if (AS == AMDGPUAS::REGION_ADDRESS)
687 return SIAtomicAddrSpace::GDS;
688
689 return SIAtomicAddrSpace::OTHER;
690}
691
692SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
693 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
694}
695
696std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
697 const MachineBasicBlock::iterator &MI) const {
698 assert(MI->getNumMemOperands() > 0);
699
700 SyncScope::ID SSID = SyncScope::SingleThread;
701 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
702 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
703 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
704 bool IsNonTemporal = true;
705 bool IsVolatile = false;
706
707 // Validator should check whether or not MMOs cover the entire set of
708 // locations accessed by the memory instruction.
709 for (const auto &MMO : MI->memoperands()) {
710 IsNonTemporal &= MMO->isNonTemporal();
711 IsVolatile |= MMO->isVolatile();
712 InstrAddrSpace |=
713 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
714 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
715 if (OpOrdering != AtomicOrdering::NotAtomic) {
716 const auto &IsSyncScopeInclusion =
717 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
718 if (!IsSyncScopeInclusion) {
719 reportUnsupported(MI,
720 "Unsupported non-inclusive atomic synchronization scope");
721 return std::nullopt;
722 }
723
724 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
725 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
726 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
727 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
728 FailureOrdering =
729 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
730 }
731 }
732
733 SIAtomicScope Scope = SIAtomicScope::NONE;
734 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
735 bool IsCrossAddressSpaceOrdering = false;
736 if (Ordering != AtomicOrdering::NotAtomic) {
737 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
738 if (!ScopeOrNone) {
739 reportUnsupported(MI, "Unsupported atomic synchronization scope");
740 return std::nullopt;
741 }
742 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
743 *ScopeOrNone;
744 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
745 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
746 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
747 reportUnsupported(MI, "Unsupported atomic address space");
748 return std::nullopt;
749 }
750 }
751 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
752 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
753 IsNonTemporal);
754}
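// Worked example (not from the original source): a FLAT atomic load with a
// single MMO marked "load acquire syncscope("workgroup")" produces
// Ordering == Acquire, Scope == WORKGROUP, InstrAddrSpace == FLAT and
// OrderingAddrSpace == ATOMIC; with several MMOs the orderings are merged by
// getMergedAtomicOrdering and the address-space masks are ORed together.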
755
756std::optional<SIMemOpInfo>
757SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
758 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
759
760 if (!(MI->mayLoad() && !MI->mayStore()))
761 return std::nullopt;
762
763 // Be conservative if there are no memory operands.
764 if (MI->getNumMemOperands() == 0)
765 return SIMemOpInfo();
766
767 return constructFromMIWithMMO(MI);
768}
769
770std::optional<SIMemOpInfo>
771SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
772 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
773
774 if (!(!MI->mayLoad() && MI->mayStore()))
775 return std::nullopt;
776
777 // Be conservative if there are no memory operands.
778 if (MI->getNumMemOperands() == 0)
779 return SIMemOpInfo();
780
781 return constructFromMIWithMMO(MI);
782}
783
784std::optional<SIMemOpInfo>
785SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
786 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
787
788 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
789 return std::nullopt;
790
791 AtomicOrdering Ordering =
792 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
793
794 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
795 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
796 if (!ScopeOrNone) {
797 reportUnsupported(MI, "Unsupported atomic synchronization scope");
798 return std::nullopt;
799 }
800
801 SIAtomicScope Scope = SIAtomicScope::NONE;
802 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
803 bool IsCrossAddressSpaceOrdering = false;
804 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
805 *ScopeOrNone;
806
807 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
808 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
809 reportUnsupported(MI, "Unsupported atomic address space");
810 return std::nullopt;
811 }
812
813 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
814 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
815}
816
817std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
818 const MachineBasicBlock::iterator &MI) const {
819 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
820
821 if (!(MI->mayLoad() && MI->mayStore()))
822 return std::nullopt;
823
824 // Be conservative if there are no memory operands.
825 if (MI->getNumMemOperands() == 0)
826 return SIMemOpInfo();
827
828 return constructFromMIWithMMO(MI);
829}
830
831SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
832 TII = ST.getInstrInfo();
833 IV = getIsaVersion(ST.getCPU());
834 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
835}
836
837bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
838 AMDGPU::CPol::CPol Bit) const {
839 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
840 if (!CPol)
841 return false;
842
843 CPol->setImm(CPol->getImm() | Bit);
844 return true;
845}
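// Illustrative sketch (not part of the original source): enableNamedBit simply
// ORs the requested CPol bit into the instruction's existing cpol immediate,
// so repeated calls compose, e.g.:
//
//   enableNamedBit(MI, AMDGPU::CPol::GLC);
//   enableNamedBit(MI, AMDGPU::CPol::SLC);
//   // cpol now has both GLC and SLC set, with any other bits left intact.
//
// Instructions without a cpol operand are left untouched and the call returns
// false.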
846
847/* static */
848std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
849 GCNSubtarget::Generation Generation = ST.getGeneration();
850 if (ST.hasGFX940Insts())
851 return std::make_unique<SIGfx940CacheControl>(ST);
852 if (ST.hasGFX90AInsts())
853 return std::make_unique<SIGfx90ACacheControl>(ST);
854 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
855 return std::make_unique<SIGfx6CacheControl>(ST);
856 if (Generation < AMDGPUSubtarget::GFX10)
857 return std::make_unique<SIGfx7CacheControl>(ST);
858 if (Generation < AMDGPUSubtarget::GFX11)
859 return std::make_unique<SIGfx10CacheControl>(ST);
860 return std::make_unique<SIGfx11CacheControl>(ST);
861}
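// Note (not from the original source): the gfx940/gfx90a checks must come
// before the generation comparisons because those targets report a GFX9
// generation. For example, with -mcpu=gfx90a, Generation is GFX9 (< GFX10),
// yet create() returns SIGfx90ACacheControl because hasGFX90AInsts() is
// tested first.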
862
863bool SIGfx6CacheControl::enableLoadCacheBypass(
864 const MachineBasicBlock::iterator &MI,
865 SIAtomicScope Scope,
866 SIAtomicAddrSpace AddrSpace) const {
867 assert(MI->mayLoad() && !MI->mayStore());
868 bool Changed = false;
869
870 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
871 switch (Scope) {
872 case SIAtomicScope::SYSTEM:
873 case SIAtomicScope::AGENT:
874 // Set L1 cache policy to MISS_EVICT.
875 // Note: there is no L2 cache bypass policy at the ISA level.
876 Changed |= enableGLCBit(MI);
877 break;
878 case SIAtomicScope::WORKGROUP:
879 case SIAtomicScope::WAVEFRONT:
880 case SIAtomicScope::SINGLETHREAD:
881 // No cache to bypass.
882 break;
883 default:
884 llvm_unreachable("Unsupported synchronization scope");
885 }
886 }
887
888 /// The scratch address space does not need the global memory caches
889 /// to be bypassed as all memory operations by the same thread are
890 /// sequentially consistent, and no other thread can access scratch
891 /// memory.
892
893 /// Other address spaces do not have a cache.
894
895 return Changed;
896}
897
898bool SIGfx6CacheControl::enableStoreCacheBypass(
899 const MachineBasicBlock::iterator &MI,
900 SIAtomicScope Scope,
901 SIAtomicAddrSpace AddrSpace) const {
902 assert(!MI->mayLoad() && MI->mayStore());
903 bool Changed = false;
904
905 /// The L1 cache is write-through so does not need to be bypassed. There is
906 /// no bypass control for the L2 cache at the ISA level.
907
908 return Changed;
909}
910
911bool SIGfx6CacheControl::enableRMWCacheBypass(
912 const MachineBasicBlock::iterator &MI,
913 SIAtomicScope Scope,
914 SIAtomicAddrSpace AddrSpace) const {
915 assert(MI->mayLoad() && MI->mayStore());
916 bool Changed = false;
917
918 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
919 /// bypassed, and the GLC bit is instead used to indicate if they are
920 /// return or no-return.
921 /// Note: there is no L2 cache coherent bypass control at the ISA level.
922
923 return Changed;
924}
925
926bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
927 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
928 bool IsVolatile, bool IsNonTemporal) const {
929 // Only handle load and store, not atomic read-modify-write instructions. The
930 // latter use glc to indicate if the atomic returns a result, so glc must not
931 // be used for cache control.
932 assert(MI->mayLoad() ^ MI->mayStore());
933
934 // Only update load and store, not LLVM IR atomic read-modify-write
935 // instructions. The latter are always marked as volatile, so they cannot
936 // sensibly be handled here without pessimizing all atomics. They also do not
937 // support the nontemporal attribute.
938 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
939
940 bool Changed = false;
941
942 if (IsVolatile) {
943 // Set L1 cache policy to be MISS_EVICT for load instructions
944 // and MISS_LRU for store instructions.
945 // Note: there is no L2 cache bypass policy at the ISA level.
946 if (Op == SIMemOp::LOAD)
947 Changed |= enableGLCBit(MI);
948
949 // Ensure operation has completed at system scope to cause all volatile
950 // operations to be visible outside the program in a global order. Do not
951 // request cross address space as only the global address space can be
952 // observable outside the program, so no need to cause a waitcnt for LDS
953 // address space operations.
954 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
955 Position::AFTER);
956
957 return Changed;
958 }
959
960 if (IsNonTemporal) {
961 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
962 // for both loads and stores, and the L2 cache policy to STREAM.
963 Changed |= enableGLCBit(MI);
964 Changed |= enableSLCBit(MI);
965 return Changed;
966 }
967
968 return Changed;
969}
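// Illustrative sketch (not part of the original source): for a volatile global
// load on these targets the result is, roughly, a glc load followed by a full
// vmcnt wait:
//
//   buffer_load_dword v0, ... glc
//   s_waitcnt vmcnt(0)
//
// while a nontemporal access only gets glc and slc set and no wait is inserted.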
970
971bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
972 SIAtomicScope Scope,
973 SIAtomicAddrSpace AddrSpace,
974 SIMemOp Op,
975 bool IsCrossAddrSpaceOrdering,
976 Position Pos) const {
977 bool Changed = false;
978
979 MachineBasicBlock &MBB = *MI->getParent();
980 DebugLoc DL = MI->getDebugLoc();
981
982 if (Pos == Position::AFTER)
983 ++MI;
984
985 bool VMCnt = false;
986 bool LGKMCnt = false;
987
988 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
989 SIAtomicAddrSpace::NONE) {
990 switch (Scope) {
991 case SIAtomicScope::SYSTEM:
992 case SIAtomicScope::AGENT:
993 VMCnt |= true;
994 break;
995 case SIAtomicScope::WORKGROUP:
996 case SIAtomicScope::WAVEFRONT:
997 case SIAtomicScope::SINGLETHREAD:
998 // The L1 cache keeps all memory operations in order for
999 // wavefronts in the same work-group.
1000 break;
1001 default:
1002 llvm_unreachable("Unsupported synchronization scope");
1003 }
1004 }
1005
1006 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1007 switch (Scope) {
1008 case SIAtomicScope::SYSTEM:
1009 case SIAtomicScope::AGENT:
1010 case SIAtomicScope::WORKGROUP:
1011 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1012 // not needed as LDS operations for all waves are executed in a total
1013 // global ordering as observed by all waves. Required if also
1014 // synchronizing with global/GDS memory as LDS operations could be
1015 // reordered with respect to later global/GDS memory operations of the
1016 // same wave.
1017 LGKMCnt |= IsCrossAddrSpaceOrdering;
1018 break;
1019 case SIAtomicScope::WAVEFRONT:
1020 case SIAtomicScope::SINGLETHREAD:
1021 // The LDS keeps all memory operations in order for
1022 // the same wavefront.
1023 break;
1024 default:
1025 llvm_unreachable("Unsupported synchronization scope");
1026 }
1027 }
1028
1029 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1030 switch (Scope) {
1031 case SIAtomicScope::SYSTEM:
1032 case SIAtomicScope::AGENT:
1033 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1034 // is not needed as GDS operations for all waves are executed in a total
1035 // global ordering as observed by all waves. Required if also
1036 // synchronizing with global/LDS memory as GDS operations could be
1037 // reordered with respect to later global/LDS memory operations of the
1038 // same wave.
1039 LGKMCnt |= IsCrossAddrSpaceOrdering;
1040 break;
1041 case SIAtomicScope::WORKGROUP:
1042 case SIAtomicScope::WAVEFRONT:
1043 case SIAtomicScope::SINGLETHREAD:
1044 // The GDS keeps all memory operations in order for
1045 // the same work-group.
1046 break;
1047 default:
1048 llvm_unreachable("Unsupported synchronization scope");
1049 }
1050 }
1051
1052 if (VMCnt || LGKMCnt) {
1053 unsigned WaitCntImmediate =
1054 AMDGPU::encodeWaitcnt(IV,
1055 VMCnt ? 0 : getVmcntBitMask(IV),
1056 getExpcntBitMask(IV),
1057 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1058 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1059 Changed = true;
1060 }
1061
1062 if (Pos == Position::AFTER)
1063 --MI;
1064
1065 return Changed;
1066}
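// Illustrative sketch (not part of the original source): for a release of the
// GLOBAL and LDS address spaces at agent scope with cross-address-space
// ordering, both counters are requested and the emitted instruction is
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//
// expcnt is left at its full bit-mask (i.e. not waited on), as encoded by
// AMDGPU::encodeWaitcnt above.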
1067
1068bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1069 SIAtomicScope Scope,
1070 SIAtomicAddrSpace AddrSpace,
1071 Position Pos) const {
1072 if (!InsertCacheInv)
1073 return false;
1074
1075 bool Changed = false;
1076
1077 MachineBasicBlock &MBB = *MI->getParent();
1078 DebugLoc DL = MI->getDebugLoc();
1079
1080 if (Pos == Position::AFTER)
1081 ++MI;
1082
1083 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1084 switch (Scope) {
1085 case SIAtomicScope::SYSTEM:
1086 case SIAtomicScope::AGENT:
1087 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1088 Changed = true;
1089 break;
1090 case SIAtomicScope::WORKGROUP:
1091 case SIAtomicScope::WAVEFRONT:
1092 case SIAtomicScope::SINGLETHREAD:
1093 // No cache to invalidate.
1094 break;
1095 default:
1096 llvm_unreachable("Unsupported synchronization scope");
1097 }
1098 }
1099
1100 /// The scratch address space does not need the global memory cache
1101 /// to be flushed as all memory operations by the same thread are
1102 /// sequentially consistent, and no other thread can access scratch
1103 /// memory.
1104
1105 /// Other address spaces do not have a cache.
1106
1107 if (Pos == Position::AFTER)
1108 --MI;
1109
1110 return Changed;
1111}
1112
1113bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1114 SIAtomicScope Scope,
1115 SIAtomicAddrSpace AddrSpace,
1116 bool IsCrossAddrSpaceOrdering,
1117 Position Pos) const {
1118 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1119 IsCrossAddrSpaceOrdering, Pos);
1120}
1121
1122bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1123 SIAtomicScope Scope,
1124 SIAtomicAddrSpace AddrSpace,
1125 Position Pos) const {
1126 if (!InsertCacheInv)
1127 return false;
1128
1129 bool Changed = false;
1130
1131 MachineBasicBlock &MBB = *MI->getParent();
1132 DebugLoc DL = MI->getDebugLoc();
1133
1134 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1135
1136 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1137 ? AMDGPU::BUFFER_WBINVL1
1138 : AMDGPU::BUFFER_WBINVL1_VOL;
1139
1140 if (Pos == Position::AFTER)
1141 ++MI;
1142
1143 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1144 switch (Scope) {
1145 case SIAtomicScope::SYSTEM:
1146 case SIAtomicScope::AGENT:
1147 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1148 Changed = true;
1149 break;
1150 case SIAtomicScope::WORKGROUP:
1151 case SIAtomicScope::WAVEFRONT:
1152 case SIAtomicScope::SINGLETHREAD:
1153 // No cache to invalidate.
1154 break;
1155 default:
1156 llvm_unreachable("Unsupported synchronization scope");
1157 }
1158 }
1159
1160 /// The scratch address space does not need the global memory cache
1161 /// to be flushed as all memory operations by the same thread are
1162 /// sequentially consistent, and no other thread can access scratch
1163 /// memory.
1164
1165 /// Other address spaces do not have a cache.
1166
1167 if (Pos == Position::AFTER)
1168 --MI;
1169
1170 return Changed;
1171}
1172
1173bool SIGfx90ACacheControl::enableLoadCacheBypass(
1174 const MachineBasicBlock::iterator &MI,
1175 SIAtomicScope Scope,
1176 SIAtomicAddrSpace AddrSpace) const {
1177 assert(MI->mayLoad() && !MI->mayStore());
1178 bool Changed = false;
1179
1180 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1181 switch (Scope) {
1182 case SIAtomicScope::SYSTEM:
1183 case SIAtomicScope::AGENT:
1184 // Set the L1 cache policy to MISS_LRU.
1185 // Note: there is no L2 cache bypass policy at the ISA level.
1186 Changed |= enableGLCBit(MI);
1187 break;
1188 case SIAtomicScope::WORKGROUP:
1189 // In threadgroup split mode the waves of a work-group can be executing on
1190 // different CUs. Therefore need to bypass the L1 which is per CU.
1191 // Otherwise in non-threadgroup split mode all waves of a work-group are
1192 // on the same CU, and so the L1 does not need to be bypassed.
1193 if (ST.isTgSplitEnabled())
1194 Changed |= enableGLCBit(MI);
1195 break;
1196 case SIAtomicScope::WAVEFRONT:
1197 case SIAtomicScope::SINGLETHREAD:
1198 // No cache to bypass.
1199 break;
1200 default:
1201 llvm_unreachable("Unsupported synchronization scope");
1202 }
1203 }
1204
1205 /// The scratch address space does not need the global memory caches
1206 /// to be bypassed as all memory operations by the same thread are
1207 /// sequentially consistent, and no other thread can access scratch
1208 /// memory.
1209
1210 /// Other address spaces do not have a cache.
1211
1212 return Changed;
1213}
1214
1215bool SIGfx90ACacheControl::enableStoreCacheBypass(
1216 const MachineBasicBlock::iterator &MI,
1217 SIAtomicScope Scope,
1218 SIAtomicAddrSpace AddrSpace) const {
1219 assert(!MI->mayLoad() && MI->mayStore());
1220 bool Changed = false;
1221
1222 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1223 switch (Scope) {
1224 case SIAtomicScope::SYSTEM:
1225 case SIAtomicScope::AGENT:
1226 /// Do not set glc for store atomic operations as they implicitly write
1227 /// through the L1 cache.
1228 break;
1229 case SIAtomicScope::WORKGROUP:
1230 case SIAtomicScope::WAVEFRONT:
1231 case SIAtomicScope::SINGLETHREAD:
1232 // No cache to bypass. Store atomics implicitly write through the L1
1233 // cache.
1234 break;
1235 default:
1236 llvm_unreachable("Unsupported synchronization scope");
1237 }
1238 }
1239
1240 /// The scratch address space does not need the global memory caches
1241 /// to be bypassed as all memory operations by the same thread are
1242 /// sequentially consistent, and no other thread can access scratch
1243 /// memory.
1244
1245 /// Other address spaces do not have a cache.
1246
1247 return Changed;
1248}
1249
1250bool SIGfx90ACacheControl::enableRMWCacheBypass(
1251 const MachineBasicBlock::iterator &MI,
1252 SIAtomicScope Scope,
1253 SIAtomicAddrSpace AddrSpace) const {
1254 assert(MI->mayLoad() && MI->mayStore());
1255 bool Changed = false;
1256
1257 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1258 switch (Scope) {
1259 case SIAtomicScope::SYSTEM:
1260 case SIAtomicScope::AGENT:
1261 /// Do not set glc for RMW atomic operations as they implicitly bypass
1262 /// the L1 cache, and the glc bit is instead used to indicate if they are
1263 /// return or no-return.
1264 break;
1265 case SIAtomicScope::WORKGROUP:
1266 case SIAtomicScope::WAVEFRONT:
1267 case SIAtomicScope::SINGLETHREAD:
1268 // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1269 break;
1270 default:
1271 llvm_unreachable("Unsupported synchronization scope");
1272 }
1273 }
1274
1275 return Changed;
1276}
1277
1278bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1279 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1280 bool IsVolatile, bool IsNonTemporal) const {
1281 // Only handle load and store, not atomic read-modify-write instructions. The
1282 // latter use glc to indicate if the atomic returns a result, so glc must not
1283 // be used for cache control.
1284 assert(MI->mayLoad() ^ MI->mayStore());
1285
1286 // Only update load and store, not LLVM IR atomic read-modify-write
1287 // instructions. The latter are always marked as volatile, so they cannot
1288 // sensibly be handled here without pessimizing all atomics. They also do not
1289 // support the nontemporal attribute.
1290 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1291
1292 bool Changed = false;
1293
1294 if (IsVolatile) {
1295 // Set L1 cache policy to be MISS_EVICT for load instructions
1296 // and MISS_LRU for store instructions.
1297 // Note: there is no L2 cache bypass policy at the ISA level.
1298 if (Op == SIMemOp::LOAD)
1299 Changed |= enableGLCBit(MI);
1300
1301 // Ensure operation has completed at system scope to cause all volatile
1302 // operations to be visible outside the program in a global order. Do not
1303 // request cross address space as only the global address space can be
1304 // observable outside the program, so no need to cause a waitcnt for LDS
1305 // address space operations.
1306 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1307 Position::AFTER);
1308
1309 return Changed;
1310 }
1311
1312 if (IsNonTemporal) {
1313 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1314 // for both loads and stores, and the L2 cache policy to STREAM.
1315 Changed |= enableGLCBit(MI);
1316 Changed |= enableSLCBit(MI);
1317 return Changed;
1318 }
1319
1320 return Changed;
1321}
1322
1323bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1324 SIAtomicScope Scope,
1325 SIAtomicAddrSpace AddrSpace,
1326 SIMemOp Op,
1327 bool IsCrossAddrSpaceOrdering,
1328 Position Pos) const {
1329 if (ST.isTgSplitEnabled()) {
1330 // In threadgroup split mode the waves of a work-group can be executing on
1331 // different CUs. Therefore need to wait for global or GDS memory operations
1332 // to complete to ensure they are visible to waves in the other CUs.
1333 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1334 // the same CU, so no need to wait for global memory as all waves in the
1335 // work-group access the same L1, nor wait for GDS as accesses are ordered
1336 // on a CU.
1337 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1338 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1339 (Scope == SIAtomicScope::WORKGROUP)) {
1340 // Same as GFX7 using agent scope.
1341 Scope = SIAtomicScope::AGENT;
1342 }
1343 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1344 // LDS memory operations.
1345 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1346 }
1347 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1348 IsCrossAddrSpaceOrdering, Pos);
1349}
1350
1351bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1352 SIAtomicScope Scope,
1353 SIAtomicAddrSpace AddrSpace,
1354 Position Pos) const {
1355 if (!InsertCacheInv)
1356 return false;
1357
1358 bool Changed = false;
1359
1360 MachineBasicBlock &MBB = *MI->getParent();
1361 DebugLoc DL = MI->getDebugLoc();
1362
1363 if (Pos == Position::AFTER)
1364 ++MI;
1365
1366 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1367 switch (Scope) {
1368 case SIAtomicScope::SYSTEM:
1369 // Ensures that following loads will not see stale remote VMEM data or
1370 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1371 // CC will never be stale due to the local memory probes.
1372 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1373 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1374 // hardware does not reorder memory operations by the same wave with
1375 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1376 // remove any cache lines of earlier writes by the same wave and ensures
1377 // later reads by the same wave will refetch the cache lines.
1378 Changed = true;
1379 break;
1380 case SIAtomicScope::AGENT:
1381 // Same as GFX7.
1382 break;
1383 case SIAtomicScope::WORKGROUP:
1384 // In threadgroup split mode the waves of a work-group can be executing on
1385 // different CUs. Therefore need to invalidate the L1 which is per CU.
1386 // Otherwise in non-threadgroup split mode all waves of a work-group are
1387 // on the same CU, and so the L1 does not need to be invalidated.
1388 if (ST.isTgSplitEnabled()) {
1389 // Same as GFX7 using agent scope.
1390 Scope = SIAtomicScope::AGENT;
1391 }
1392 break;
1393 case SIAtomicScope::WAVEFRONT:
1394 case SIAtomicScope::SINGLETHREAD:
1395 // Same as GFX7.
1396 break;
1397 default:
1398 llvm_unreachable("Unsupported synchronization scope");
1399 }
1400 }
1401
1402 /// The scratch address space does not need the global memory cache
1403 /// to be flushed as all memory operations by the same thread are
1404 /// sequentially consistent, and no other thread can access scratch
1405 /// memory.
1406
1407 /// Other address spaces do not have a cache.
1408
1409 if (Pos == Position::AFTER)
1410 --MI;
1411
1412 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1413
1414 return Changed;
1415}
1416
1417bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1418 SIAtomicScope Scope,
1419 SIAtomicAddrSpace AddrSpace,
1420 bool IsCrossAddrSpaceOrdering,
1421 Position Pos) const {
1422 bool Changed = false;
1423
1424 MachineBasicBlock &MBB = *MI->getParent();
1425 DebugLoc DL = MI->getDebugLoc();
1426
1427 if (Pos == Position::AFTER)
1428 ++MI;
1429
1430 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1431 switch (Scope) {
1432 case SIAtomicScope::SYSTEM:
1433 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1434 // hardware does not reorder memory operations by the same wave with
1435 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1436 // to initiate writeback of any dirty cache lines of earlier writes by the
1437 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1438 // writeback has completed.
1439 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1440 // Set SC bits to indicate system scope.
1441 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1442 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1443 // vmcnt(0)" needed by the "BUFFER_WBL2".
1444 Changed = true;
1445 break;
1446 case SIAtomicScope::AGENT:
1447 case SIAtomicScope::WORKGROUP:
1448 case SIAtomicScope::WAVEFRONT:
1449 case SIAtomicScope::SINGLETHREAD:
1450 // Same as GFX7.
1451 break;
1452 default:
1453 llvm_unreachable("Unsupported synchronization scope");
1454 }
1455 }
1456
1457 if (Pos == Position::AFTER)
1458 --MI;
1459
1460 Changed |=
1461 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1462 IsCrossAddrSpaceOrdering, Pos);
1463
1464 return Changed;
1465}
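// Illustrative sketch (not part of the original source): for a system-scope
// release on gfx90a this produces, in order, roughly
//
//   buffer_wbl2
//   s_waitcnt vmcnt(0)   ; from the GFX7 insertRelease/insertWait path
//
// so the L2 writeback is both initiated and known to be complete before the
// following atomic operation is issued.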
1466
1467bool SIGfx940CacheControl::enableLoadCacheBypass(
1468 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1469 SIAtomicAddrSpace AddrSpace) const {
1470 assert(MI->mayLoad() && !MI->mayStore());
1471 bool Changed = false;
1472
1473 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1474 switch (Scope) {
1475 case SIAtomicScope::SYSTEM:
1476 // Set SC bits to indicate system scope.
1477 Changed |= enableSC0Bit(MI);
1478 Changed |= enableSC1Bit(MI);
1479 break;
1480 case SIAtomicScope::AGENT:
1481 // Set SC bits to indicate agent scope.
1482 Changed |= enableSC1Bit(MI);
1483 break;
1484 case SIAtomicScope::WORKGROUP:
1485 // In threadgroup split mode the waves of a work-group can be executing on
1486 // different CUs. Therefore need to bypass the L1 which is per CU.
1487 // Otherwise in non-threadgroup split mode all waves of a work-group are
1488 // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1489 // bits to indicate work-group scope will do this automatically.
1490 Changed |= enableSC0Bit(MI);
1491 break;
1492 case SIAtomicScope::WAVEFRONT:
1493 case SIAtomicScope::SINGLETHREAD:
1494 // Leave SC bits unset to indicate wavefront scope.
1495 break;
1496 default:
1497 llvm_unreachable("Unsupported synchronization scope");
1498 }
1499 }
1500
1501 /// The scratch address space does not need the global memory caches
1502 /// to be bypassed as all memory operations by the same thread are
1503 /// sequentially consistent, and no other thread can access scratch
1504 /// memory.
1505
1506 /// Other address spaces do not have a cache.
1507
1508 return Changed;
1509}
1510
1511bool SIGfx940CacheControl::enableStoreCacheBypass(
1512 const MachineBasicBlock::iterator &MI,
1513 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1514 assert(!MI->mayLoad() && MI->mayStore());
1515 bool Changed = false;
1516
1517 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1518 switch (Scope) {
1519 case SIAtomicScope::SYSTEM:
1520 // Set SC bits to indicate system scope.
1521 Changed |= enableSC0Bit(MI);
1522 Changed |= enableSC1Bit(MI);
1523 break;
1524 case SIAtomicScope::AGENT:
1525 // Set SC bits to indicate agent scope.
1526 Changed |= enableSC1Bit(MI);
1527 break;
1528 case SIAtomicScope::WORKGROUP:
1529 // Set SC bits to indicate workgroup scope.
1530 Changed |= enableSC0Bit(MI);
1531 break;
1532 case SIAtomicScope::WAVEFRONT:
1533 case SIAtomicScope::SINGLETHREAD:
1534 // Leave SC bits unset to indicate wavefront scope.
1535 break;
1536 default:
1537 llvm_unreachable("Unsupported synchronization scope");
1538 }
1539 }
1540
1541 /// The scratch address space does not need the global memory caches
1542 /// to be bypassed as all memory operations by the same thread are
1543 /// sequentially consistent, and no other thread can access scratch
1544 /// memory.
1545
1546 /// Other address spaces do not have a cache.
1547
1548 return Changed;
1549}
1550
1551bool SIGfx940CacheControl::enableRMWCacheBypass(
1552 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1553 SIAtomicAddrSpace AddrSpace) const {
1554 assert(MI->mayLoad() && MI->mayStore());
1555 bool Changed = false;
1556
1557 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1558 switch (Scope) {
1559 case SIAtomicScope::SYSTEM:
1560 // Set SC1 bit to indicate system scope.
1561 Changed |= enableSC1Bit(MI);
1562 break;
1563 case SIAtomicScope::AGENT:
1564 case SIAtomicScope::WORKGROUP:
1565 case SIAtomicScope::WAVEFRONT:
1566 case SIAtomicScope::SINGLETHREAD:
1567 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1568 // to indicate system or agent scope. The SC0 bit is used to indicate if
1569 // they are return or no-return. Leave SC1 bit unset to indicate agent
1570 // scope.
1571 break;
1572 default:
1573 llvm_unreachable("Unsupported synchronization scope");
1574 }
1575 }
1576
1577 return Changed;
1578}
1579
1580bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1581 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1582 bool IsVolatile, bool IsNonTemporal) const {
1583 // Only handle load and store, not atomic read-modify-write instructions. The
1584 // latter use glc to indicate if the atomic returns a result, so glc must not
1585 // be used for cache control.
1586 assert(MI->mayLoad() ^ MI->mayStore());
1587
1588 // Only update load and store, not LLVM IR atomic read-modify-write
1589 // instructions. The latter are always marked as volatile, so they cannot
1590 // sensibly be handled here without pessimizing all atomics. They also do not
1591 // support the nontemporal attribute.
1592 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1593
1594 bool Changed = false;
1595
1596 if (IsVolatile) {
1597 // Set SC bits to indicate system scope.
1598 Changed |= enableSC0Bit(MI);
1599 Changed |= enableSC1Bit(MI);
1600
1601 // Ensure operation has completed at system scope to cause all volatile
1602 // operations to be visible outside the program in a global order. Do not
1603 // request cross address space as only the global address space can be
1604 // observable outside the program, so no need to cause a waitcnt for LDS
1605 // address space operations.
1606 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1607 Position::AFTER);
1608
1609 return Changed;
1610 }
1611
1612 if (IsNonTemporal) {
1613 Changed |= enableNTBit(MI);
1614 return Changed;
1615 }
1616
1617 return Changed;
1618}
1619
1620bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1621 SIAtomicScope Scope,
1622 SIAtomicAddrSpace AddrSpace,
1623 Position Pos) const {
1624 if (!InsertCacheInv)
1625 return false;
1626
1627 bool Changed = false;
1628
1629 MachineBasicBlock &MBB = *MI->getParent();
1630 DebugLoc DL = MI->getDebugLoc();
1631
1632 if (Pos == Position::AFTER)
1633 ++MI;
1634
1635 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1636 switch (Scope) {
1637 case SIAtomicScope::SYSTEM:
1638 // Ensures that following loads will not see stale remote VMEM data or
1639 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1640 // CC will never be stale due to the local memory probes.
1641 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1642          // Set SC bits to indicate system scope.
1643          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1644      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1645 // hardware does not reorder memory operations by the same wave with
1646 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1647 // remove any cache lines of earlier writes by the same wave and ensures
1648 // later reads by the same wave will refetch the cache lines.
1649 Changed = true;
1650 break;
1651 case SIAtomicScope::AGENT:
1652      // Ensures that following loads will not see stale remote data or local
1653 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1654 // due to the memory probes.
1655 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1656          // Set SC bits to indicate agent scope.
1657          .addImm(AMDGPU::CPol::SC1);
1658      // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1659      // does not reorder memory operations with respect to a preceding buffer
1660      // invalidate. The invalidate is guaranteed to remove any cache lines of
1661      // earlier writes and ensures later reads will refetch the cache lines.
1662 Changed = true;
1663 break;
1664 case SIAtomicScope::WORKGROUP:
1665 // In threadgroup split mode the waves of a work-group can be executing on
1666 // different CUs. Therefore need to invalidate the L1 which is per CU.
1667 // Otherwise in non-threadgroup split mode all waves of a work-group are
1668 // on the same CU, and so the L1 does not need to be invalidated.
1669 if (ST.isTgSplitEnabled()) {
1670        // Ensures L1 is invalidated if in threadgroup split mode. In
1671        // non-threadgroup split mode it is a NOP, but there is no point
1672        // generating it if we know we are not in that mode.
1673        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1674            // Set SC bits to indicate work-group scope.
1675            .addImm(AMDGPU::CPol::SC0);
1676        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1677        // does not reorder memory operations with respect to a preceding buffer
1678        // invalidate. The invalidate is guaranteed to remove any cache lines of
1679        // earlier writes and ensures later reads will refetch the cache lines.
1680 Changed = true;
1681 }
1682 break;
1683 case SIAtomicScope::WAVEFRONT:
1684 case SIAtomicScope::SINGLETHREAD:
1685 // Could generate "BUFFER_INV" but it would do nothing as there are no
1686 // caches to invalidate.
1687 break;
1688 default:
1689 llvm_unreachable("Unsupported synchronization scope");
1690 }
1691 }
1692
1693 /// The scratch address space does not need the global memory cache
1694 /// to be flushed as all memory operations by the same thread are
1695 /// sequentially consistent, and no other thread can access scratch
1696 /// memory.
1697
1698 /// Other address spaces do not have a cache.
1699
1700 if (Pos == Position::AFTER)
1701 --MI;
1702
1703 return Changed;
1704}
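// Illustrative example (not part of the original source): an agent-scope
// acquire on gfx940 is expected to expand to a cache invalidate with only SC1
// set, roughly
//   buffer_inv sc1
// with no extra "s_waitcnt vmcnt(0)" after it, for the reasons given in the
// comments above.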
1705
1706bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1707 SIAtomicScope Scope,
1708 SIAtomicAddrSpace AddrSpace,
1709 bool IsCrossAddrSpaceOrdering,
1710 Position Pos) const {
1711 bool Changed = false;
1712
1713 MachineBasicBlock &MBB = *MI->getParent();
1714 DebugLoc DL = MI->getDebugLoc();
1715
1716 if (Pos == Position::AFTER)
1717 ++MI;
1718
1719 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1720 switch (Scope) {
1721 case SIAtomicScope::SYSTEM:
1722 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1723 // hardware does not reorder memory operations by the same wave with
1724 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1725 // to initiate writeback of any dirty cache lines of earlier writes by the
1726 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1727 // writeback has completed.
1728 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1729          // Set SC bits to indicate system scope.
1730          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1731      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1732 // SIAtomicScope::SYSTEM, the following insertWait will generate the
1733 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1734 Changed = true;
1735 break;
1736 case SIAtomicScope::AGENT:
1737 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1738          // Set SC bits to indicate agent scope.
1739          .addImm(AMDGPU::CPol::SC1);
1740
1741 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1742 // SIAtomicScope::AGENT, the following insertWait will generate the
1743 // required "S_WAITCNT vmcnt(0)".
1744 Changed = true;
1745 break;
1746 case SIAtomicScope::WORKGROUP:
1747 case SIAtomicScope::WAVEFRONT:
1748 case SIAtomicScope::SINGLETHREAD:
1749 // Do not generate "BUFFER_WBL2" as there are no caches it would
1750 // writeback, and would require an otherwise unnecessary
1751 // "S_WAITCNT vmcnt(0)".
1752 break;
1753 default:
1754 llvm_unreachable("Unsupported synchronization scope");
1755 }
1756 }
1757
1758 if (Pos == Position::AFTER)
1759 --MI;
1760
1761  // Ensure the S_WAITCNT needed by any "BUFFER_WBL2" above is emitted, along
1762  // with any other waits required for the release.
1763 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1764 IsCrossAddrSpaceOrdering, Pos);
1765
1766 return Changed;
1767}
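// Illustrative example (not part of the original source): an agent-scope
// release on gfx940 is expected to expand roughly to
//   buffer_wbl2 sc1
//   s_waitcnt vmcnt(0)
// where the trailing wait comes from the insertWait call at the end of
// insertRelease.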
1768
1769bool SIGfx10CacheControl::enableLoadCacheBypass(
1770    const MachineBasicBlock::iterator &MI,
1771    SIAtomicScope Scope,
1772 SIAtomicAddrSpace AddrSpace) const {
1773 assert(MI->mayLoad() && !MI->mayStore());
1774 bool Changed = false;
1775
1776 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1777 switch (Scope) {
1778 case SIAtomicScope::SYSTEM:
1779 case SIAtomicScope::AGENT:
1780 // Set the L0 and L1 cache policies to MISS_EVICT.
1781 // Note: there is no L2 cache coherent bypass control at the ISA level.
1782 Changed |= enableGLCBit(MI);
1783 Changed |= enableDLCBit(MI);
1784 break;
1785 case SIAtomicScope::WORKGROUP:
1786 // In WGP mode the waves of a work-group can be executing on either CU of
1787 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1788 // CU mode all waves of a work-group are on the same CU, and so the L0
1789 // does not need to be bypassed.
1790 if (!ST.isCuModeEnabled())
1791 Changed |= enableGLCBit(MI);
1792 break;
1793 case SIAtomicScope::WAVEFRONT:
1794 case SIAtomicScope::SINGLETHREAD:
1795 // No cache to bypass.
1796 break;
1797 default:
1798 llvm_unreachable("Unsupported synchronization scope");
1799 }
1800 }
1801
1802 /// The scratch address space does not need the global memory caches
1803 /// to be bypassed as all memory operations by the same thread are
1804 /// sequentially consistent, and no other thread can access scratch
1805 /// memory.
1806
1807 /// Other address spaces do not have a cache.
1808
1809 return Changed;
1810}
1811
1812bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1813 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1814 bool IsVolatile, bool IsNonTemporal) const {
1815
1816  // Only handle load and store, not atomic read-modify-write instructions. The
1817 // latter use glc to indicate if the atomic returns a result and so must not
1818 // be used for cache control.
1819 assert(MI->mayLoad() ^ MI->mayStore());
1820
1821  // Only update load and store, not LLVM IR atomic read-modify-write
1822  // instructions. The latter are always marked as volatile, which cannot be
1823  // handled sensibly here without pessimizing all atomics. They also do not
1824  // support the nontemporal attribute.
1825 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1826
1827 bool Changed = false;
1828
1829 if (IsVolatile) {
1830 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1831 // and MISS_LRU for store instructions.
1832 // Note: there is no L2 cache coherent bypass control at the ISA level.
1833 if (Op == SIMemOp::LOAD) {
1834 Changed |= enableGLCBit(MI);
1835 Changed |= enableDLCBit(MI);
1836 }
1837
1838 // Ensure operation has completed at system scope to cause all volatile
1839 // operations to be visible outside the program in a global order. Do not
1840 // request cross address space as only the global address space can be
1841 // observable outside the program, so no need to cause a waitcnt for LDS
1842 // address space operations.
1843 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1844 Position::AFTER);
1845 return Changed;
1846 }
1847
1848 if (IsNonTemporal) {
1849 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1850 // and L2 cache policy to STREAM.
1851 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1852 // to MISS_EVICT and the L2 cache policy to STREAM.
1853 if (Op == SIMemOp::STORE)
1854 Changed |= enableGLCBit(MI);
1855 Changed |= enableSLCBit(MI);
1856
1857 return Changed;
1858 }
1859
1860 return Changed;
1861}
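// Illustrative example (not part of the original source): on gfx10 a volatile
// global load is expected to be emitted roughly as
//   global_load_dword v0, v[0:1], off glc dlc
//   s_waitcnt vmcnt(0)
// while a nontemporal store gets "glc slc" and a nontemporal load gets "slc"
// only, with no trailing wait.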
1862
1863bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1864 SIAtomicScope Scope,
1865 SIAtomicAddrSpace AddrSpace,
1866 SIMemOp Op,
1867 bool IsCrossAddrSpaceOrdering,
1868 Position Pos) const {
1869 bool Changed = false;
1870
1871 MachineBasicBlock &MBB = *MI->getParent();
1872 DebugLoc DL = MI->getDebugLoc();
1873
1874 if (Pos == Position::AFTER)
1875 ++MI;
1876
1877 bool VMCnt = false;
1878 bool VSCnt = false;
1879 bool LGKMCnt = false;
1880
1881 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1882 SIAtomicAddrSpace::NONE) {
1883 switch (Scope) {
1884 case SIAtomicScope::SYSTEM:
1885 case SIAtomicScope::AGENT:
1886 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1887 VMCnt |= true;
1888 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1889 VSCnt |= true;
1890 break;
1891 case SIAtomicScope::WORKGROUP:
1892 // In WGP mode the waves of a work-group can be executing on either CU of
1893 // the WGP. Therefore need to wait for operations to complete to ensure
1894 // they are visible to waves in the other CU as the L0 is per CU.
1895      // Otherwise, in CU mode, all waves of a work-group are on the same CU,
1896      // which shares the same L0.
1897 if (!ST.isCuModeEnabled()) {
1898 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1899 VMCnt |= true;
1900 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1901 VSCnt |= true;
1902 }
1903 break;
1904 case SIAtomicScope::WAVEFRONT:
1905 case SIAtomicScope::SINGLETHREAD:
1906 // The L0 cache keeps all memory operations in order for
1907 // work-items in the same wavefront.
1908 break;
1909 default:
1910 llvm_unreachable("Unsupported synchronization scope");
1911 }
1912 }
1913
1914 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1915 switch (Scope) {
1916 case SIAtomicScope::SYSTEM:
1917 case SIAtomicScope::AGENT:
1918 case SIAtomicScope::WORKGROUP:
1919 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1920 // not needed as LDS operations for all waves are executed in a total
1921 // global ordering as observed by all waves. Required if also
1922 // synchronizing with global/GDS memory as LDS operations could be
1923 // reordered with respect to later global/GDS memory operations of the
1924 // same wave.
1925 LGKMCnt |= IsCrossAddrSpaceOrdering;
1926 break;
1927 case SIAtomicScope::WAVEFRONT:
1928 case SIAtomicScope::SINGLETHREAD:
1929 // The LDS keeps all memory operations in order for
1930 // the same wavefront.
1931 break;
1932 default:
1933 llvm_unreachable("Unsupported synchronization scope");
1934 }
1935 }
1936
1937 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1938 switch (Scope) {
1939 case SIAtomicScope::SYSTEM:
1940 case SIAtomicScope::AGENT:
1941      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1942 // is not needed as GDS operations for all waves are executed in a total
1943 // global ordering as observed by all waves. Required if also
1944 // synchronizing with global/LDS memory as GDS operations could be
1945 // reordered with respect to later global/LDS memory operations of the
1946 // same wave.
1947 LGKMCnt |= IsCrossAddrSpaceOrdering;
1948 break;
1949 case SIAtomicScope::WORKGROUP:
1950 case SIAtomicScope::WAVEFRONT:
1951 case SIAtomicScope::SINGLETHREAD:
1952 // The GDS keeps all memory operations in order for
1953 // the same work-group.
1954 break;
1955 default:
1956 llvm_unreachable("Unsupported synchronization scope");
1957 }
1958 }
1959
1960 if (VMCnt || LGKMCnt) {
1961    unsigned WaitCntImmediate =
1962      AMDGPU::encodeWaitcnt(IV,
1963                            VMCnt ? 0 : getVmcntBitMask(IV),
1964                            getExpcntBitMask(IV),
1965                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1966 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1967 Changed = true;
1968 }
1969
1970 if (VSCnt) {
1971 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1972 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1973 .addImm(0);
1974 Changed = true;
1975 }
1976
1977 if (Pos == Position::AFTER)
1978 --MI;
1979
1980 return Changed;
1981}
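// Illustrative sketch (not part of the original source), reusing the same
// helpers as above and assuming an AMDGPU::IsaVersion value IV: an
// "s_waitcnt" that drains only vmcnt while leaving expcnt and lgkmcnt
// unconstrained could be built as
//   unsigned Imm = AMDGPU::encodeWaitcnt(IV,
//                                        /*Vmcnt=*/0,
//                                        /*Expcnt=*/getExpcntBitMask(IV),
//                                        /*Lgkmcnt=*/getLgkmcntBitMask(IV));
//   BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Imm);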
1982
1983bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1984 SIAtomicScope Scope,
1985 SIAtomicAddrSpace AddrSpace,
1986 Position Pos) const {
1987 if (!InsertCacheInv)
1988 return false;
1989
1990 bool Changed = false;
1991
1992 MachineBasicBlock &MBB = *MI->getParent();
1993 DebugLoc DL = MI->getDebugLoc();
1994
1995 if (Pos == Position::AFTER)
1996 ++MI;
1997
1998 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1999 switch (Scope) {
2000 case SIAtomicScope::SYSTEM:
2001 case SIAtomicScope::AGENT:
2002 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2003 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2004 Changed = true;
2005 break;
2006 case SIAtomicScope::WORKGROUP:
2007 // In WGP mode the waves of a work-group can be executing on either CU of
2008 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2009      // in CU mode all waves of a work-group are on the same CU, and so the
2010 // L0 does not need to be invalidated.
2011 if (!ST.isCuModeEnabled()) {
2012 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2013 Changed = true;
2014 }
2015 break;
2016 case SIAtomicScope::WAVEFRONT:
2017 case SIAtomicScope::SINGLETHREAD:
2018 // No cache to invalidate.
2019 break;
2020 default:
2021 llvm_unreachable("Unsupported synchronization scope");
2022 }
2023 }
2024
2025 /// The scratch address space does not need the global memory cache
2026 /// to be flushed as all memory operations by the same thread are
2027 /// sequentially consistent, and no other thread can access scratch
2028 /// memory.
2029
2030 /// Other address spaces do not have a cache.
2031
2032 if (Pos == Position::AFTER)
2033 --MI;
2034
2035 return Changed;
2036}
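// Illustrative example (not part of the original source): an agent-scope
// acquire on gfx10 is expected to expand roughly to
//   buffer_gl0_inv
//   buffer_gl1_inv
// while a workgroup-scope acquire emits only "buffer_gl0_inv", and only when
// the subtarget is not in CU mode.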
2037
2038bool SIGfx11CacheControl::enableLoadCacheBypass(
2039 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2040 SIAtomicAddrSpace AddrSpace) const {
2041 assert(MI->mayLoad() && !MI->mayStore());
2042 bool Changed = false;
2043
2044 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2045 switch (Scope) {
2046 case SIAtomicScope::SYSTEM:
2047 case SIAtomicScope::AGENT:
2048 // Set the L0 and L1 cache policies to MISS_EVICT.
2049 // Note: there is no L2 cache coherent bypass control at the ISA level.
2050 Changed |= enableGLCBit(MI);
2051 break;
2052 case SIAtomicScope::WORKGROUP:
2053 // In WGP mode the waves of a work-group can be executing on either CU of
2054 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2055 // CU mode all waves of a work-group are on the same CU, and so the L0
2056 // does not need to be bypassed.
2057 if (!ST.isCuModeEnabled())
2058 Changed |= enableGLCBit(MI);
2059 break;
2060 case SIAtomicScope::WAVEFRONT:
2061 case SIAtomicScope::SINGLETHREAD:
2062 // No cache to bypass.
2063 break;
2064 default:
2065 llvm_unreachable("Unsupported synchronization scope");
2066 }
2067 }
2068
2069 /// The scratch address space does not need the global memory caches
2070 /// to be bypassed as all memory operations by the same thread are
2071 /// sequentially consistent, and no other thread can access scratch
2072 /// memory.
2073
2074 /// Other address spaces do not have a cache.
2075
2076 return Changed;
2077}
2078
2079bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2080 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2081 bool IsVolatile, bool IsNonTemporal) const {
2082
2083  // Only handle load and store, not atomic read-modify-write instructions. The
2084 // latter use glc to indicate if the atomic returns a result and so must not
2085 // be used for cache control.
2086 assert(MI->mayLoad() ^ MI->mayStore());
2087
2088  // Only update load and store, not LLVM IR atomic read-modify-write
2089  // instructions. The latter are always marked as volatile, which cannot be
2090  // handled sensibly here without pessimizing all atomics. They also do not
2091  // support the nontemporal attribute.
2092 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2093
2094 bool Changed = false;
2095
2096 if (IsVolatile) {
2097 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2098 // and MISS_LRU for store instructions.
2099 // Note: there is no L2 cache coherent bypass control at the ISA level.
2100 if (Op == SIMemOp::LOAD)
2101 Changed |= enableGLCBit(MI);
2102
2103 // Set MALL NOALLOC for load and store instructions.
2104 Changed |= enableDLCBit(MI);
2105
2106 // Ensure operation has completed at system scope to cause all volatile
2107 // operations to be visible outside the program in a global order. Do not
2108 // request cross address space as only the global address space can be
2109 // observable outside the program, so no need to cause a waitcnt for LDS
2110 // address space operations.
2111 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2112 Position::AFTER);
2113 return Changed;
2114 }
2115
2116 if (IsNonTemporal) {
2117 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2118 // and L2 cache policy to STREAM.
2119 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2120 // to MISS_EVICT and the L2 cache policy to STREAM.
2121 if (Op == SIMemOp::STORE)
2122 Changed |= enableGLCBit(MI);
2123 Changed |= enableSLCBit(MI);
2124
2125 // Set MALL NOALLOC for load and store instructions.
2126 Changed |= enableDLCBit(MI);
2127 return Changed;
2128 }
2129
2130 return Changed;
2131}
2132
2133bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2134 if (AtomicPseudoMIs.empty())
2135 return false;
2136
2137 for (auto &MI : AtomicPseudoMIs)
2138 MI->eraseFromParent();
2139
2140 AtomicPseudoMIs.clear();
2141 return true;
2142}
2143
2144bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2145                                   MachineBasicBlock::iterator &MI) {
2146 assert(MI->mayLoad() && !MI->mayStore());
2147
2148 bool Changed = false;
2149
2150 if (MOI.isAtomic()) {
2151 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2152 MOI.getOrdering() == AtomicOrdering::Acquire ||
2153 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2154 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2155 MOI.getOrderingAddrSpace());
2156 }
2157
2158 if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2159 Changed |= CC->insertWait(MI, MOI.getScope(),
2160 MOI.getOrderingAddrSpace(),
2161 SIMemOp::LOAD | SIMemOp::STORE,
2162 MOI.getIsCrossAddressSpaceOrdering(),
2163 Position::BEFORE);
2164
2165 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2166 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2167 Changed |= CC->insertWait(MI, MOI.getScope(),
2168 MOI.getInstrAddrSpace(),
2169 SIMemOp::LOAD,
2170 MOI.getIsCrossAddressSpaceOrdering(),
2171 Position::AFTER);
2172 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2173 MOI.getOrderingAddrSpace(),
2174 Position::AFTER);
2175 }
2176
2177 return Changed;
2178 }
2179
2180 // Atomic instructions already bypass caches to the scope specified by the
2181 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2182 // need additional treatment.
2183 Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2184 SIMemOp::LOAD, MOI.isVolatile(),
2185 MOI.isNonTemporal());
2186 return Changed;
2187}
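// Illustrative example (not part of the original source): on gfx10 an
//   %v = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire, align 4
// is expected to expand roughly to
//   global_load_dword v0, v[0:1], off glc dlc
//   s_waitcnt vmcnt(0)
//   buffer_gl0_inv
//   buffer_gl1_inv
// matching the enableLoadCacheBypass / insertWait / insertAcquire sequence
// above.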
2188
2189bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2190                                    MachineBasicBlock::iterator &MI) {
2191 assert(!MI->mayLoad() && MI->mayStore());
2192
2193 bool Changed = false;
2194
2195 if (MOI.isAtomic()) {
2196 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2197 MOI.getOrdering() == AtomicOrdering::Release ||
2198 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2199 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2200 MOI.getOrderingAddrSpace());
2201 }
2202
2203 if (MOI.getOrdering() == AtomicOrdering::Release ||
2204 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2205 Changed |= CC->insertRelease(MI, MOI.getScope(),
2206 MOI.getOrderingAddrSpace(),
2207 MOI.getIsCrossAddressSpaceOrdering(),
2208 Position::BEFORE);
2209
2210 return Changed;
2211 }
2212
2213 // Atomic instructions already bypass caches to the scope specified by the
2214 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2215 // need additional treatment.
2216 Changed |= CC->enableVolatileAndOrNonTemporal(
2217 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2218 MOI.isNonTemporal());
2219 return Changed;
2220}
2221
2222bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2223                                          MachineBasicBlock::iterator &MI) {
2224 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2225
2226 AtomicPseudoMIs.push_back(MI);
2227 bool Changed = false;
2228
2229 if (MOI.isAtomic()) {
2230 if (MOI.getOrdering() == AtomicOrdering::Acquire)
2231 Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2232 SIMemOp::LOAD | SIMemOp::STORE,
2233 MOI.getIsCrossAddressSpaceOrdering(),
2234 Position::BEFORE);
2235
2236 if (MOI.getOrdering() == AtomicOrdering::Release ||
2237 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2238 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2239      /// TODO: This relies on a barrier always generating a waitcnt
2240      /// for LDS to ensure it is not reordered with the completion of
2241      /// the preceding LDS operations. If the barrier had a memory
2242      /// ordering and memory scope, then the library would not need to
2243      /// generate a fence. Support for barrier could be added in this
2244      /// file. SIInsertWaitcnts.cpp could then stop unconditionally
2245      /// adding an S_WAITCNT before an S_BARRIER.
2246 Changed |= CC->insertRelease(MI, MOI.getScope(),
2247 MOI.getOrderingAddrSpace(),
2248 MOI.getIsCrossAddressSpaceOrdering(),
2249 Position::BEFORE);
2250
2251 // TODO: If both release and invalidate are happening they could be combined
2252 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2253 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2254 // track cache invalidate and write back instructions.
2255
2256 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2257 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2258 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2259 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2260 MOI.getOrderingAddrSpace(),
2261 Position::BEFORE);
2262
2263 return Changed;
2264 }
2265
2266 return Changed;
2267}
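// Illustrative example (not part of the original source): on gfx10 a
//   fence syncscope("agent") acquire
// is expected to expand roughly to
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0x0
//   buffer_gl0_inv
//   buffer_gl1_inv
// i.e. a wait covering both loads and stores followed by the cache
// invalidates inserted by insertAcquire.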
2268
2269bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2270                                                 MachineBasicBlock::iterator &MI) {
2271 assert(MI->mayLoad() && MI->mayStore());
2272
2273 bool Changed = false;
2274
2275 if (MOI.isAtomic()) {
2276 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2277 MOI.getOrdering() == AtomicOrdering::Acquire ||
2278 MOI.getOrdering() == AtomicOrdering::Release ||
2279 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2280 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2281 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2282 MOI.getInstrAddrSpace());
2283 }
2284
2285 if (MOI.getOrdering() == AtomicOrdering::Release ||
2286 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2287 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2288 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2289 Changed |= CC->insertRelease(MI, MOI.getScope(),
2290 MOI.getOrderingAddrSpace(),
2291 MOI.getIsCrossAddressSpaceOrdering(),
2292 Position::BEFORE);
2293
2294 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2295 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2296 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2297 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2298 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2299 Changed |= CC->insertWait(MI, MOI.getScope(),
2300 MOI.getInstrAddrSpace(),
2301 isAtomicRet(*MI) ? SIMemOp::LOAD :
2302 SIMemOp::STORE,
2303 MOI.getIsCrossAddressSpaceOrdering(),
2304 Position::AFTER);
2305 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2306 MOI.getOrderingAddrSpace(),
2307 Position::AFTER);
2308 }
2309
2310 return Changed;
2311 }
2312
2313 return Changed;
2314}
2315
2316bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2317 bool Changed = false;
2318
2319 SIMemOpAccess MOA(MF);
2320 CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2321
2322 for (auto &MBB : MF) {
2323 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2324
2325 // Unbundle instructions after the post-RA scheduler.
2326 if (MI->isBundle() && MI->mayLoadOrStore()) {
2327        MachineBasicBlock::instr_iterator II(MI->getIterator());
2328        for (MachineBasicBlock::instr_iterator I = ++II->getIterator(), E = MBB.instr_end();
2329             I != E && I->isBundledWithPred(); ++I) {
2330 I->unbundleFromPred();
2331 for (MachineOperand &MO : I->operands())
2332 if (MO.isReg())
2333 MO.setIsInternalRead(false);
2334 }
2335
2336 MI->eraseFromParent();
2337 MI = II->getIterator();
2338 }
2339
2340 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2341 continue;
2342
2343 if (const auto &MOI = MOA.getLoadInfo(MI))
2344 Changed |= expandLoad(*MOI, MI);
2345 else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2346 Changed |= expandStore(*MOI, MI);
2347 Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
2348 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2349 Changed |= expandAtomicFence(*MOI, MI);
2350 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2351 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2352 }
2353 }
2354
2355 Changed |= removeAtomicPseudoMIs();
2356 return Changed;
2357}
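// Usage sketch (not part of the original source): the pass is intended to be
// added to the GCN codegen pipeline by the target, e.g. via something like
//   addPass(&SIMemoryLegalizerID);
// in AMDGPUTargetMachine's pass configuration, or created directly through
// createSIMemoryLegalizerPass() declared in AMDGPU.h.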
2358
2359INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2360
2361char SIMemoryLegalizer::ID = 0;
2362char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2363
2364FunctionPass *llvm::createSIMemoryLegalizerPass() {
2365  return new SIMemoryLegalizer();
2366}