LLVM 23.0.0git
AMDGPULowerIntrinsics.cpp
Go to the documentation of this file.
1//===-- AMDGPULowerIntrinsics.cpp -------------------------------------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Lower intrinsics that would otherwise require separate handling in both
10// SelectionDAG and GlobalISel.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "AMDGPUTargetMachine.h"
16#include "GCNSubtarget.h"
17#include "llvm/IR/IRBuilder.h"
19#include "llvm/IR/IntrinsicsAMDGPU.h"
22
23#define DEBUG_TYPE "amdgpu-lower-intrinsics"
24
25using namespace llvm;
26
27namespace {
28
29class AMDGPULowerIntrinsicsImpl {
30public:
31 Module &M;
32 const AMDGPUTargetMachine &TM;
33
34 AMDGPULowerIntrinsicsImpl(Module &M, const AMDGPUTargetMachine &TM)
35 : M(M), TM(TM) {}
36
37 bool run();
38
39private:
40 bool visitBarrier(IntrinsicInst &I);
41};
42
43class AMDGPULowerIntrinsicsLegacy : public ModulePass {
44public:
45 static char ID;
46
47 AMDGPULowerIntrinsicsLegacy() : ModulePass(ID) {}
48
49 bool runOnModule(Module &M) override;
50
51 void getAnalysisUsage(AnalysisUsage &AU) const override {
53 }
54};
55
56template <class T> static void forEachCall(Function &Intrin, T Callback) {
57 for (User *U : make_early_inc_range(Intrin.users())) {
58 if (auto *CI = dyn_cast<IntrinsicInst>(U))
59 Callback(CI);
60 }
61}
62
63} // anonymous namespace
64
65bool AMDGPULowerIntrinsicsImpl::run() {
66 bool Changed = false;
67
68 for (Function &F : M) {
69 switch (F.getIntrinsicID()) {
70 default:
71 continue;
72 case Intrinsic::amdgcn_s_barrier:
73 case Intrinsic::amdgcn_s_barrier_signal:
74 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
75 case Intrinsic::amdgcn_s_barrier_wait:
76 case Intrinsic::amdgcn_s_cluster_barrier:
77 forEachCall(F, [&](IntrinsicInst *II) { Changed |= visitBarrier(*II); });
78 break;
79 }
80 }
81
82 return Changed;
83}
84
85// Optimize barriers and lower s_(cluster_)barrier to a sequence of split
86// barrier intrinsics.
87bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
88 assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
89 I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
90 I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst ||
91 I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait ||
92 I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier);
93
94 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*I.getFunction());
95 bool IsSingleWaveWG = false;
96
97 if (TM.getOptLevel() > CodeGenOptLevel::None)
98 IsSingleWaveWG = ST.isSingleWavefrontWorkgroup(*I.getFunction());
99
100 IRBuilder<> B(&I);
101
102 // Lower the s_cluster_barrier intrinsic first. There is no corresponding
103 // hardware instruction in any subtarget.
104 if (I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier) {
105 // The default cluster barrier expects one signal per workgroup. So we need
106 // a workgroup barrier first.
107 if (IsSingleWaveWG) {
108 B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {})
109 ->copyMetadata(I);
110 } else {
111 Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP);
112 Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP);
113 CallInst *IsFirst = B.CreateIntrinsic(
114 B.getInt1Ty(), Intrinsic::amdgcn_s_barrier_signal_isfirst,
115 {BarrierID_32});
116 IsFirst->copyMetadata(I);
117 B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
118 {BarrierID_16})
119 ->copyMetadata(I);
120
121 Instruction *ThenTerm =
122 SplitBlockAndInsertIfThen(IsFirst, I.getIterator(), false);
123 B.SetInsertPoint(ThenTerm);
124 }
125
126 // Now we can signal the cluster barrier from a single wave and wait for the
127 // barrier in all waves.
128 Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::CLUSTER);
129 Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::CLUSTER);
130 B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
131 {BarrierID_32})
132 ->copyMetadata(I);
133
134 B.SetInsertPoint(&I);
135 B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
136 {BarrierID_16})
137 ->copyMetadata(I);
138
139 I.eraseFromParent();
140 return true;
141 }
142
143 bool IsWorkgroupScope = false;
144
145 if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait ||
146 I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
147 I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst) {
148 int BarrierID = cast<ConstantInt>(I.getArgOperand(0))->getSExtValue();
149 if (BarrierID == AMDGPU::Barrier::TRAP ||
150 BarrierID == AMDGPU::Barrier::WORKGROUP ||
153 IsWorkgroupScope = true;
154 } else {
155 assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier);
156 IsWorkgroupScope = true;
157 }
158
159 if (IsWorkgroupScope && IsSingleWaveWG) {
160 // Down-grade waits, remove split signals.
161 if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
162 I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait) {
163 B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {})
164 ->copyMetadata(I);
165 } else if (I.getIntrinsicID() ==
166 Intrinsic::amdgcn_s_barrier_signal_isfirst) {
167 // If we're the only wave of the workgroup, we're always first.
168 I.replaceAllUsesWith(B.getInt1(true));
169 }
170 I.eraseFromParent();
171 return true;
172 }
173
174 if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier &&
175 ST.hasSplitBarriers()) {
176 // Lower to split barriers.
177 Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP);
178 Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP);
179 B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
180 {BarrierID_32})
181 ->copyMetadata(I);
182 B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
183 {BarrierID_16})
184 ->copyMetadata(I);
185 I.eraseFromParent();
186 return true;
187 }
188
189 return false;
190}
191
194 AMDGPULowerIntrinsicsImpl Impl(M, TM);
195 if (!Impl.run())
196 return PreservedAnalyses::all();
198}
199
200bool AMDGPULowerIntrinsicsLegacy::runOnModule(Module &M) {
201 auto &TPC = getAnalysis<TargetPassConfig>();
202 const AMDGPUTargetMachine &TM = TPC.getTM<AMDGPUTargetMachine>();
203
204 AMDGPULowerIntrinsicsImpl Impl(M, TM);
205 return Impl.run();
206}
207
208#define PASS_DESC "AMDGPU lower intrinsics"
209INITIALIZE_PASS_BEGIN(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
210 false)
212INITIALIZE_PASS_END(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
213 false)
214
215char AMDGPULowerIntrinsicsLegacy::ID = 0;
216
218 return new AMDGPULowerIntrinsicsLegacy;
219}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
The AMDGPU TargetMachine interface definition for hw codegen targets.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T
uint64_t IntrinsicInst * II
ModuleAnalysisManager MAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
static bool forEachCall(Function &Intrin, T Callback)
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
A wrapper class for inspecting calls to intrinsic functions.
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition Pass.h:255
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
Target-Independent Code Generator Pass Configuration Options.
iterator_range< user_iterator > users()
Definition Value.h:426
Changed
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
ModulePass * createAMDGPULowerIntrinsicsLegacyPass()
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI Instruction * SplitBlockAndInsertIfThen(Value *Cond, BasicBlock::iterator SplitBefore, bool Unreachable, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, BasicBlock *ThenBlock=nullptr)
Split the containing block at the specified instruction - everything before SplitBefore stays in the ...
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
Definition MIRParser.h:39
PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM)