#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"
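// This pass optimizes atomic operations whose address is uniform across a
// wavefront: the per-lane values are combined within the wave (with DPP or an
// iterative lane-by-lane scan) so that a single lane performs one atomic
// operation, and each lane then reconstructs its own result from the scan.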
struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;

  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;
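  // buildReduction, buildScan and buildShiftRight implement the DPP-based
  // wavefront reduction / exclusive scan; buildScanIteratively is the
  // fallback that visits the active lanes one at a time.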
public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
                            bool IsPixelShader, ScanOptions ScanImpl)
      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
        ScanImpl(ScanImpl) {}
char AMDGPUAtomicOptimizer::ID = 0;
bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo *UA =
      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
      .run(F);
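// The new-pass-manager entry point (elided between these fragments) builds the
// same AMDGPUAtomicOptimizerImpl; the DataLayout, GCNSubtarget, pixel-shader
// flag and scan strategy it passes in are gathered from the TargetMachine.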
      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
          .run(F);
bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }
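  // Changed is computed before the rewrite loop because optimizeAtomic erases
  // the original instructions that ToReplace points at.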
void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }
  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
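  // A divergent value can only be combined with DPP, and only for 32-bit
  // types; otherwise the candidate is skipped below.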
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}
void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }
  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }
  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}
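// buildNonAtomicBinOp emits the plain (non-atomic) IR equivalent of an
// atomicrmw operation; it is used to combine per-lane values inside the wave.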
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::FAdd:
    return B.CreateFAdd(LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::FSub:
    return B.CreateFSub(LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);
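  // The integer Min/Max/UMin/UMax cases (elided from this listing) set a
  // CmpInst predicate and fall through to a compare-and-select after the
  // switch.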
  case AtomicRMWInst::FMax:
    return B.CreateMaxNum(LHS, RHS);
  case AtomicRMWInst::FMin:
    return B.CreateMinNum(LHS, RHS);
  }
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *AtomicTy = V->getType();
  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
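  // After the four ROW_XMASK steps every lane holds the reduction of its row
  // of 16 lanes; the code below combines values across rows and wave halves.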
  // Reduce within each pair of rows (i.e. 32 lanes).
  V = B.CreateBitCast(V, IntNTy);
  Value *Permlanex16Call = B.CreateIntrinsic(
      Intrinsic::amdgcn_permlanex16, {},
      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
  V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
                          B.CreateBitCast(Permlanex16Call, AtomicTy));
  if (ST->isWave32()) {
    return V;
  }

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    V = B.CreateBitCast(V, IntNTy);
    Value *Permlane64Call =
        B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V);
    return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
                               B.CreateBitCast(Permlane64Call, AtomicTy));
  }

  // Combine lane 0 of the lower half with lane 32 of the upper half.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  V = B.CreateBitCast(V, IntNTy);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
                             B.CreateBitCast(Lane32, AtomicTy));
}
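// buildScan produces an inclusive wavefront prefix scan of V: row_shr DPP
// steps within each row, then row broadcasts (or permlanex16/readlane on
// targets without DPP broadcasts) to propagate the row totals upwards.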
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
  Type *AtomicTy = V->getType();
  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());

  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // GFX10+ confines DPP to a single row, so cross-row steps use permlane.
    // Combine lane 15 into lanes 16..31 (and, in wave64, lane 47 into 48..63).
    V = B.CreateBitCast(V, IntNTy);
    Value *PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

    Value *UpdateDPPCall =
        B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
                                 B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
                                 B.getInt32(0xf), B.getFalse()});
    V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      V = B.CreateBitCast(V, IntNTy);
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});

      Value *UpdateDPPCall = B.CreateCall(
          UpdateDPP, {Identity, B.CreateBitCast(Lane31, AtomicTy),
                      B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()});

      V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
                              UpdateDPPCall);
    }
  }
  return V;
}
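// buildShiftRight turns the inclusive scan into an exclusive one by shifting
// every lane's value one lane upwards, patching the row boundaries with
// readlane/writelane where a full wavefront shift is not available.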
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *Identity) const {
  Type *AtomicTy = V->getType();
  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

    // On GFX10+ the shift stays within a row, so the row boundaries have to be
    // patched up with readlane/writelane pairs.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(
        WriteLane,
        {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
         B.getInt32(16), B.CreateBitCast(V, IntNTy)});
    V = B.CreateBitCast(V, AtomicTy);
    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateBitCast(V, IntNTy);
      V = B.CreateCall(WriteLane,
                       {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
                                                B.getInt32(31)}),
                        B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
      V = B.CreateBitCast(V, AtomicTy);
    }
  }
  return V;
}
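// buildScanIteratively is the non-DPP fallback: a loop that repeatedly picks
// the lowest remaining active lane (cttz of the exec ballot), reads that
// lane's value with readlane, folds it into an accumulator, and writes the
// running total back to the lane with writelane.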
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
  // Start inserting instructions for the ComputeLoop block.
  B.SetInsertPoint(ComputeLoop);
  // Phi nodes for the accumulator, the scan result, and the remaining lanes.
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

  // Use cttz to find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
  auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);

  // Read this lane's contribution to the atomic operation.
  V = B.CreateBitCast(V, IntNTy);
  Value *LaneValue =
      B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});
  LaneValue = B.CreateBitCast(LaneValue, Ty);
  // Publish the running (exclusive) total back to this lane if the scan
  // result is needed later in the kernel.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue =
        B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
                          {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
                           B.CreateBitCast(OldValuePhi, IntNTy)});
    OldValue = B.CreateBitCast(OldValue, Ty);
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate this lane's contribution.
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Clear this lane's bit and loop until no active lanes remain.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);
  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}
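// The returned pair is {per-lane exclusive scan, wavefront total}: the total
// feeds the single atomic operation and the per-lane value is used to rebuild
// each lane's individual result.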
void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  if (AtomicRMWInst::isFPOperation(Op)) {
    B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));
  }
  // Pixel shaders have helper lanes that must not take part in the atomic, so
  // the transformed sequence is guarded by an amdgcn.ps.live check.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }
  Type *const Ty = I.getType();
  Type *Int32Ty = B.getInt32Ty();
  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
  bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(Int32Ty, 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
  // Count the active lanes below us using mbcnt.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
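  // Mbcnt counts the active lanes with a lower lane id; it is 0 exactly for
  // the first active lane, which is later selected to issue the atomic, and
  // for uniform values it also gives each lane's position in the prefix.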
  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP) {
      // Set all inactive lanes to the identity value so that they contribute
      // nothing to the combined result.
      V = B.CreateBitCast(V, IntNTy);
      Identity = B.CreateBitCast(Identity, IntNTy);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
                               {V, Identity});
      NewV = B.CreateBitCast(NewV, Ty);
      V = B.CreateBitCast(V, Ty);
      Identity = B.CreateBitCast(Identity, Ty);
      if (!NeedResult && ST->hasPermLaneX16()) {
        // If no lane needs its individual result, a plain reduction suffices.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the value
        // of every active lane; it becomes the input of the single atomic.
        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
        NewV = B.CreateBitCast(NewV, IntNTy);
        NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                 {NewV, LastLaneIdx});
        NewV = B.CreateBitCast(NewV, Ty);
      }
      // Finally mark the readlanes in the WWM section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // Iterative fallback: fill the ComputeLoop/ComputeEnd blocks (created in
      // code elided from this listing) with the lane-by-lane scan.
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    }
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");
    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value the wave contributes is the old value times the number
      // of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
      NewV = B.CreateFMul(V, CtpopFP);
      break;
    }
    // (The idempotent And/Or/Min/Max cases, elided here, simply use NewV = V.)
    case AtomicRMWInst::Xor: {
      // Xor contributes V once per odd number of active lanes (its parity).
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
    }
  }

  // Only the first active lane (Mbcnt == 0) will perform the atomic.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
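  // In code elided from this listing, SplitBlockAndInsertIfThen uses Cond to
  // carve out a block that only the first active lane executes;
  // SingleLaneTerminator is that block's terminator.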
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the original block's terminator into ComputeEnd and branch from
    // the original block into ComputeLoop instead.
    B.SetInsertPoint(ComputeEnd);
    B.Insert(Terminator);

    B.SetInsertPoint(EntryBB);
    B.CreateBr(ComputeLoop);

    Predecessor = ComputeEnd;
  } else {
    Predecessor = EntryBB;
  }
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the atomic so only the chosen lane executes it, using NewV.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  B.SetInsertPoint(&I);

  PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
    // Broadcast the result of the single-lane atomic (PHI) from the first
    // lane to every lane; a 64-bit value is broadcast as two 32-bit halves.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      // (ExtractLo/ExtractHi, the two 32-bit halves of the PHI value, are
      // produced by code elided from this listing.)
      Value *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      Value *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
      BroadcastI =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, CastedPhi);
      BroadcastI = B.CreateBitCast(BroadcastI, Ty);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }
    // Each lane now needs its own portion of the result: the value the memory
    // held just before this lane's contribution was applied.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      }
    } else {
      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
                                      : B.CreateIntCast(Mbcnt, Ty, false);
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      case AtomicRMWInst::FAdd:
      case AtomicRMWInst::FSub:
        LaneOffset = B.CreateFMul(V, Mbcnt);
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}
INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
  return new AMDGPUAtomicOptimizer(ScanStrategy);
}