#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};
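// Each candidate atomic is recorded as a ReplacementInfo entry during the
// visitor walk and rewritten afterwards: one lane issues a single combined
// atomic for the whole wavefront, and the remaining lanes recover their own
// results from a cross-lane reduction or prefix scan.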
class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo *UA;
  const DataLayout *DL;
  DominatorTree *DT;
  const GCNSubtarget *ST;
  bool IsPixelShader;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;
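  // The public pass interface is not shown in the fragments above; the
  // following sketch is consistent with the rest of the listing (the pass ID,
  // runOnFunction, the two visitor entry points, and the analyses that
  // runOnFunction queries):
public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};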
char AMDGPUAtomicOptimizer::ID = 0;
bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  ST = &TPC.getTM<TargetMachine>().getSubtarget<GCNSubtarget>(F);
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}
void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address spaces.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  // (The in-tree pass also filters Op here to the integer
  // add/sub/and/or/xor/min/max forms.)
  AtomicRMWInst::BinOp Op = I.getOperation();

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, each lane is updating a different
  // address and there is nothing to combine across the wavefront.
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // A divergent value operand can only be combined via DPP, and only for
  // 32-bit operations.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // Remember the instruction; the actual rewrite happens in optimizeAtomic.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};
  ToReplace.push_back(Info);
}
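// For illustration only (not from this file): a candidate this visitor accepts
// is an LDS add where every lane uses the same pointer, e.g.
//   %old = atomicrmw add ptr addrspace(3) @counter, i32 %v seq_cst
// With a uniform %v the whole wavefront can be folded into one atomic that
// adds %v times the number of active lanes.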
void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }
  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // A divergent value operand can only be combined via DPP, and only for
  // 32-bit operations.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any other operand of the intrinsic is divergent, the operation cannot
  // be optimized.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};
  ToReplace.push_back(Info);
}
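// Unlike the atomicrmw case, the value operand of these buffer intrinsics is
// operand 0 and the resource/offset operands follow it, which is why the loop
// above starts checking for divergence at index 1.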
// Create the non-atomic counterpart of the given atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);
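  // The min/max flavours do not map onto a single IR binary operator; in the
  // in-tree implementation they are lowered to an integer compare followed by
  // a select, roughly:
  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}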
// Create a reduction of V across the wavefront, with all lanes active,
// leaving the combined result available to every lane.
Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  // Butterfly-reduce within each row of 16 lanes using row_xmask:1/2/4/8.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Combine the two 16-lane halves of each 32-lane group with permlanex16.
  V = buildNonAtomicBinOp(
      B, Op, V,
      B.CreateIntrinsic(
          Intrinsic::amdgcn_permlanex16, {},
          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));
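  // For wave32 the reduction is complete at this point; the remaining steps
  // only matter for wave64, where the two 32-lane halves still have to be
  // combined.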
  if (ST->hasPermLane64()) {
    // permlane64 swaps the upper and lower 32 lanes, combining the two halves
    // in a single step.
    return buildNonAtomicBinOp(
        B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
  }
  // Without permlane64, read one lane out of each 32-lane half (each half
  // already holds its own partial reduction) and combine them as scalars.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}
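// Unlike the permlane paths, this fallback leaves the combined value as a
// uniform scalar rather than in every lane, which is acceptable because
// buildReduction is only used when the atomic's old value is not needed.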
// Create an inclusive scan of V across the wavefront, with all lanes active.
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                        Value *V, Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  // row_shr:1/2/4/8 accumulates each lane with the lanes 1, 2, 4 and 8 below
  // it within the same row, producing an inclusive scan per 16-lane row.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row-broadcast operations: bcast15 pushes a row's total into
    // the next row, bcast31 pushes the lower half's total into the upper half.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row, so the
    // cross-row steps use permlane/readlane instead.

    // Combine lane 15 into lanes 16..31 (and, in wave64, lane 47 into lanes
    // 48..63).
    Value *const PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }
  return V;
}
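// After buildScan, every active lane holds the inclusive prefix combination of
// the values of all active lanes at or below its own index; reading the last
// lane therefore yields the whole-wavefront total.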
// Shift V right by one lane across the wavefront (all lanes active), turning
// an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                                              Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has a wavefront-wide DPP shift.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

    // Without wavefront shifts, shift within each row and then patch the
    // values that fell off the end of a row back in with read/writelane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 into the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});
    if (!ST->isWave32()) {
      // Copy the old lane 31 into the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 into the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}
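// Example: for an add where the active lanes hold {3, 1, 4}, buildScan yields
// the inclusive prefix sums {3, 4, 8} and buildShiftRight turns them into the
// exclusive sums {identity, 3, 4}, i.e. each lane's offset into the single
// combined atomic.

// Two small static helpers referenced below, getIdentityValueForAtomicOp and
// buildMul, sketched here in a form consistent with how they are used:
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

// buildMul skips the multiply when the left operand is the constant 1.
static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}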
void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // In a pixel shader, helper lanes (present only for derivative computation)
  // must not take part in the cross-lane communication, so the whole rewrite
  // is wrapped in an amdgcn.ps.live guard.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }
  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);

  // This is the value each lane wants to contribute to the atomic.
  Value *const V = I.getOperand(ValIdx);

  // A ballot over a true condition yields the mask of currently active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
  // mbcnt counts the set ballot bits below the current lane, i.e. how many
  // active lanes have a smaller lane index than this one.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);
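  // Identity element for Op: 0 for add/sub/or/xor/umax, all-ones for and/umin,
  // and the signed extremes for min/max. In the in-tree pass this constant
  // comes from the getIdentityValueForAtomicOp helper sketched earlier.
  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));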
  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  // If each lane contributes a different value, combine the values with DPP.
  if (ValDivergent) {
    // First set all inactive lanes to the identity value so that they do not
    // perturb the scan/reduction.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
    // Sub is scanned as Add; the subtraction itself happens when the single
    // atomic is issued.
    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    if (!NeedResult && ST->hasPermLaneX16()) {
      // Nobody uses the old value, so a plain reduction (cheaper than a scan)
      // is enough.
      NewV = buildReduction(B, ScanOp, NewV, Identity);
    } else {
      NewV = buildScan(B, ScanOp, NewV, Identity);
      if (NeedResult)
        ExclScan = buildShiftRight(B, NewV, Identity);

      // The last lane of the inclusive scan holds the combined value for the
      // whole wavefront; that is what the single atomic will use.
      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, LastLaneIdx});
    }
    // The cross-lane sequence above must execute in whole-wave mode (WWM) so
    // it is not clipped by the current EXEC mask.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
  } else {
    // The value is wavefront-uniform: for add/sub the combined contribution is
    // simply V times the number of active lanes (the popcount of the ballot).
    switch (Op) {
    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }
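    // The remaining uniform-value cases are not shown in the fragments; in the
    // in-tree pass the bitwise and min/max ops are simply forwarded, since
    // applying them once with a uniform value has the same effect as applying
    // them once per lane:
    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      NewV = V;
      break;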
    // For xor, the combined contribution is V times the parity of the number
    // of active lanes.
    case AtomicRMWInst::Xor: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
    }
  }
  // Only the first active lane (the one with no active lanes below it) should
  // perform the real atomic operation.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
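  // Split the block at I so that only that first lane falls through into a new
  // single-lane block, while everyone else branches straight to the exit:
  //   entry --> single_lane --> exit
  //        \------------------^
  // (EntryBB is recorded here so the result PHI below has its incoming block.)
  BasicBlock *const EntryBB = I.getParent();
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);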
  // Move into the single-lane block.
  B.SetInsertPoint(SingleLaneTerminator);
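  // The original atomic is cloned into the single-lane block with the combined
  // value substituted as its data operand; the clone (NewI here) is what feeds
  // the result PHI below.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);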
  // Move back to the exit block, inserting just before the original
  // instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // A PHI carries the single atomic's result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), EntryBB);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
    // Broadcast the value returned by the single atomic (live only in the
    // first active lane) to every lane.
    Value *BroadcastI = nullptr;
    if (TyBitWidth == 64) {
      // readfirstlane only handles 32 bits, so split the value, broadcast the
      // halves, and reassemble them.
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    }
    else if (TyBitWidth == 32) {
      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }
    // Each lane combines the broadcast value with a lane-specific offset.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      // The exclusive scan already holds this lane's offset; read it out of
      // the WWM region.
      LaneOffset =
          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
    } else if (Op == AtomicRMWInst::Add || Op == AtomicRMWInst::Sub) {
      // For add/sub the offset is V times the number of active lanes below us.
      LaneOffset = buildMul(B, V, Mbcnt);
    } else if (Op == AtomicRMWInst::Xor) {
      // For xor it is V times the parity of that count.
      LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
    } else {
      // The remaining ops are idempotent: the first lane takes the identity,
      // every other lane takes V.
      LaneOffset = B.CreateSelect(Cond, Identity, V);
    }
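    // The per-lane result is then the broadcast value combined with the lane
    // offset using the non-atomic form of the operation:
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);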
    if (IsPixelShader) {
      // Reconverge above the helper-lane guard with one more PHI.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());
      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      I.replaceAllUsesWith(Result);
    }
  }

  // Finally, delete the original atomic.
  I.eraseFromParent();
}
707 "AMDGPU atomic optimizations",
false,
false)
714 return new AMDGPUAtomicOptimizer();