#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));
namespace {

class AMDGPULateCodeGenPrepare
    : public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Function &F;
  const DataLayout &DL;
  const GCNSubtarget &ST;

  AssumptionCache *const AC;
  UniformityInfo &UA;

  SmallVector<WeakTrackingVH, 8> DeadInsts;

public:
  AMDGPULateCodeGenPrepare(Function &F, const GCNSubtarget &ST,
                           AssumptionCache *AC, UniformityInfo &UA)
      : F(F), DL(F.getDataLayout()), ST(ST), AC(AC), UA(UA) {}

  bool run();

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, DL, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};
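// For instance (illustrative, not from the pass): a base pointer with a known
// 4-byte alignment has at least two known trailing zero bits, so
// isDWORDAligned returns true for it, while a base only known to be 2-byte
// aligned fails the check.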
using ValueToValueMap = DenseMap<const Value *, Value *>;

class LiveRegOptimizer {
private:
  Module &Mod;
  const DataLayout &DL;
  const GCNSubtarget &ST;

  /// The scalar type to convert to.
  Type *const ConvertToScalar;
  /// Map of Value -> converted Value.
  ValueToValueMap ValMap;
  /// Per-basic-block cache of conversions back from the optimized type.
  DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;

public:
  /// Calculate the type to convert a problematic \p OriginalType to; in some
  /// instances this widens the type (e.g. v2i8 -> i32).
  Type *calculateConvertType(Type *OriginalType);
  /// Convert the virtual register defined by \p V to a compatible value of
  /// legal type.
  Value *convertToOptType(Instruction *V, BasicBlock::iterator &InsertPt);
  /// Convert \p V back to the original type \p ConvertType, stripping away
  /// the MSBs where the fit was imperfect (e.g. v2i32 -> v7i8).
  Value *convertFromOptType(Type *ConvertType, Instruction *V,
                            BasicBlock::iterator &InsertPt,
                            BasicBlock *InsertBlock);
  /// Check for problematic PHI nodes or cross-block values rooted at \p I and
  /// coerce them to legal types where necessary.
  bool optimizeLiveType(Instruction *I,
                        SmallVectorImpl<WeakTrackingVH> &DeadInsts);

  /// Whether the type should be replaced to avoid inefficient legalization
  /// code.
  bool shouldReplace(Type *ITy) {
    FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
    if (!VTy)
      return false;

    const auto *TLI = ST.getTargetLowering();

    Type *EltTy = VTy->getElementType();
    // Bit packing is only possible for integer elements no wider than the
    // target scalar.
    if (!EltTy->isIntegerTy() ||
        EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
      return false;

    // Only coerce illegal types.
    TargetLoweringBase::LegalizeKind LK =
        TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
    return LK.first != TargetLoweringBase::TypeLegal;
  }

  /// Operations that can consume the coerced value directly.
  bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); }

  bool isCoercionProfitable(Instruction *II) {
    SmallPtrSet<Instruction *, 4> CVisited;
    SmallVector<Instruction *, 4> UserList;

    // Check users for profitable conditions (e.g. a cross-block user which
    // can natively handle the illegal vector).
    for (User *V : II->users())
      if (auto *UseInst = dyn_cast<Instruction>(V))
        UserList.push_back(UseInst);

    auto IsLookThru = [](Instruction *II) {
      if (const auto *Intr = dyn_cast<IntrinsicInst>(II))
        return Intr->getIntrinsicID() == Intrinsic::amdgcn_perm;
      return isa<PHINode, ShuffleVectorInst, InsertElementInst,
                 ExtractElementInst, CastInst>(II);
    };

    while (!UserList.empty()) {
      Instruction *CII = UserList.pop_back_val();
      if (!CVisited.insert(CII).second)
        continue;

      if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
          !isOpLegal(CII))
        continue;

      if (isOpLegal(CII))
        return true;

      if (IsLookThru(CII))
        for (User *V : CII->users())
          if (auto *UseInst = dyn_cast<Instruction>(V))
            UserList.push_back(UseInst);
    }
    return false;
  }

  LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST)
      : Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
        ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
};

} // end anonymous namespace
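// A minimal sketch of the coercion this class performs (illustrative IR, not
// taken from the source): a <4 x i8> that is live across blocks is packed
// into an i32 next to its def and unpacked before its cross-block use, so the
// CopyToReg/CopyFromReg pair moves one legal scalar instead of four widened
// ones:
//
//   bb0:
//     %v.bc = bitcast <4 x i8> %v to i32
//     br label %bb1
//   bb1:
//     %v.unpack = bitcast i32 %v.bc to <4 x i8>
//     ; ... uses of %v.unpack ...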
bool AMDGPULateCodeGenPrepare::run() {
  // "Optimize" the virtual regs that cross basic block boundaries. When
  // building the SelectionDAG, vectors of illegal types that cross basic
  // blocks will be scalarized and widened, with each scalar living in its own
  // register. To work around this, this optimization converts the vectors to
  // equivalent vectors of legal type (which are converted back before uses in
  // subsequent blocks), to pack the bits into fewer physical registers (used
  // in CopyToReg/CopyFromReg pairs).
  LiveRegOptimizer LRO(*F.getParent(), ST);

  bool Changed = false;

  bool HasScalarSubwordLoads = ST.hasScalarSubwordLoads();

  for (auto &BB : reverse(F))
    for (Instruction &I : make_early_inc_range(reverse(BB))) {
      Changed |= !HasScalarSubwordLoads && visit(I);
      Changed |= LRO.optimizeLiveType(&I, DeadInsts);
    }

  RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
  return Changed;
}
Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
  assert(OriginalType->getScalarSizeInBits() <=
         ConvertToScalar->getScalarSizeInBits());

  FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);

  TypeSize OriginalSize = DL.getTypeSizeInBits(VTy);
  TypeSize ConvertScalarSize = DL.getTypeSizeInBits(ConvertToScalar);
  unsigned ConvertEltCount =
      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;

  if (OriginalSize <= ConvertScalarSize)
    return IntegerType::get(Mod.getContext(), ConvertScalarSize);

  return VectorType::get(Type::getIntNTy(Mod.getContext(), ConvertScalarSize),
                         ConvertEltCount, false);
}
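// Worked examples (illustrative): <3 x i8> is 24 bits, which fits in the
// 32-bit scalar, so it converts to i32; <7 x i8> is 56 bits and needs
// ceil(56 / 32) = 2 words, so it converts to <2 x i32>.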
Value *LiveRegOptimizer::convertToOptType(Instruction *V,
                                          BasicBlock::iterator &InsertPt) {
  FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
  Type *NewTy = calculateConvertType(V->getType());

  TypeSize OriginalSize = DL.getTypeSizeInBits(VTy);
  TypeSize NewSize = DL.getTypeSizeInBits(NewTy);

  IRBuilder<> Builder(V->getParent(), InsertPt);
  // If there is a bitsize match, we can fit the old vector into a new vector
  // of the desired type.
  if (OriginalSize == NewSize)
    return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");

  // If there is a bitsize mismatch, we must use a wider vector.
  assert(NewSize > OriginalSize);
  uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();

  SmallVector<int, 8> ShuffleMask;
  uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
  for (unsigned I = 0; I < OriginalElementCount; I++)
    ShuffleMask.push_back(I);

  for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
    ShuffleMask.push_back(OriginalElementCount);

  Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
  return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
}
Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
                                            BasicBlock::iterator &InsertPt,
                                            BasicBlock *InsertBB) {
  FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);

  TypeSize OriginalSize = DL.getTypeSizeInBits(V->getType());
  TypeSize NewSize = DL.getTypeSizeInBits(NewVTy);

  IRBuilder<> Builder(InsertBB, InsertPt);
  // If there is a bitsize match, we simply convert back to the original type.
  if (OriginalSize == NewSize)
    return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");

  // If there is a bitsize mismatch, then we must have used a wider value to
  // hold the bits.
  assert(OriginalSize > NewSize);
  // For wide scalars, we can just truncate the value.
  if (!V->getType()->isVectorTy()) {
    Instruction *Trunc = cast<Instruction>(
        Builder.CreateTrunc(V, IntegerType::get(Mod.getContext(), NewSize)));
    return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
  }

  // For wider values, cast to a vector of the narrow element type first, then
  // shuffle the excess lanes away.
  uint64_t ExpandedVecElementCount =
      OriginalSize / NewVTy->getScalarSizeInBits();
  Type *ExpandedVT = VectorType::get(
      Type::getIntNTy(Mod.getContext(), NewVTy->getScalarSizeInBits()),
      ExpandedVecElementCount, false);
  Value *Converted = Builder.CreateBitCast(V, ExpandedVT);

  uint64_t NarrowElementCount = NewVTy->getElementCount().getFixedValue();
  SmallVector<int, 8> ShuffleMask(NarrowElementCount);
  std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);

  return Builder.CreateShuffleVector(Converted, ShuffleMask);
}
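// Two illustrative sketches of these conversions (assumed IR, not from the
// source). Packing a <3 x i8> pads it to four lanes (the extra lane reads the
// poison operand) before the bitcast:
//
//   %v.pad = shufflevector <3 x i8> %v, <3 x i8> poison,
//                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
//   %v.bc  = bitcast <4 x i8> %v.pad to i32
//
// Unpacking a <7 x i8> from its <2 x i32> carrier goes through <8 x i8> and
// shuffles the excess lane away:
//
//   %w.wide = bitcast <2 x i32> %w to <8 x i8>
//   %w.orig = shufflevector <8 x i8> %w.wide, <8 x i8> poison,
//             <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>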
bool LiveRegOptimizer::optimizeLiveType(
    Instruction *I, SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  SmallVector<Instruction *, 4> Worklist;
  SmallPtrSet<PHINode *, 4> PhiNodes;
  SmallPtrSet<Instruction *, 4> Defs;
  SmallPtrSet<Instruction *, 4> Uses;
  SmallPtrSet<Instruction *, 4> Visited;

  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *II = Worklist.pop_back_val();

    if (!Visited.insert(II).second)
      continue;

    if (!shouldReplace(II->getType()))
      continue;

    if (!isCoercionProfitable(II))
      continue;

    if (PHINode *Phi = dyn_cast<PHINode>(II)) {
      PhiNodes.insert(Phi);
      // Collect all the incoming values of problematic PHI nodes.
      for (Value *V : Phi->incoming_values()) {
        // Repeat the collection process for newly found PHI nodes.
        if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
          if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
            Worklist.push_back(OpPhi);
          continue;
        }

        Instruction *IncInst = dyn_cast<Instruction>(V);
        // Other incoming value types (e.g. vector literals) are unhandled.
        if (!IncInst && !isa<ConstantAggregateZero>(V))
          return false;

        // Collect all other incoming values for coercion.
        if (IncInst)
          Defs.insert(IncInst);
      }
    }

    // Collect all relevant uses.
    for (User *V : II->users()) {
      // Repeat the collection process for II's users.
      if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
        if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
          Worklist.push_back(OpPhi);
        continue;
      }

      Instruction *UseInst = cast<Instruction>(V);
      // Collect all uses of PHI nodes and any use that crosses a BB boundary.
      if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
        Uses.insert(UseInst);
        if (!isa<PHINode>(II))
          Defs.insert(II);
      }
    }
  }

  // Coerce and track the defs.
  for (Instruction *D : Defs) {
    if (!ValMap.contains(D)) {
      BasicBlock::iterator InsertPt = std::next(D->getIterator());
      Value *ConvertVal = convertToOptType(D, InsertPt);
      assert(ConvertVal);
      ValMap[D] = ConvertVal;
    }
  }

  // Construct new-typed PHI nodes.
  for (PHINode *Phi : PhiNodes) {
    ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
                                  Phi->getNumIncomingValues(),
                                  Phi->getName() + ".tc", Phi->getIterator());
  }

  // Connect all the PHI nodes with their new incoming values.
  for (PHINode *Phi : PhiNodes) {
    PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
    bool MissingIncVal = false;
    for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
      Value *IncVal = Phi->getIncomingValue(I);
      if (isa<ConstantAggregateZero>(IncVal)) {
        Type *NewType = calculateConvertType(Phi->getType());
        NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
                            Phi->getIncomingBlock(I));
      } else if (Value *Val = ValMap.lookup(IncVal))
        NewPhi->addIncoming(Val, Phi->getIncomingBlock(I));
      else
        MissingIncVal = true;
    }
    if (MissingIncVal) {
      Value *DeadVal = ValMap[Phi];
      // The coercion chain of the PHI is broken. Delete the Phi from the
      // ValMap and any connected / user Phis.
      SmallVector<Value *, 4> PHIWorklist;
      SmallPtrSet<Value *, 4> VisitedPhis;
      PHIWorklist.push_back(DeadVal);
      while (!PHIWorklist.empty()) {
        Value *NextDeadValue = PHIWorklist.pop_back_val();
        VisitedPhis.insert(NextDeadValue);
        auto OriginalPhi =
            llvm::find_if(PhiNodes, [this, &NextDeadValue](PHINode *CandPhi) {
              return ValMap[CandPhi] == NextDeadValue;
            });
        // This PHI may have already been removed from maps when unwinding a
        // previous Phi.
        if (OriginalPhi != PhiNodes.end())
          ValMap.erase(*OriginalPhi);

        DeadInsts.emplace_back(cast<Instruction>(NextDeadValue));

        for (User *U : NextDeadValue->users()) {
          if (!VisitedPhis.contains(cast<PHINode>(U)))
            PHIWorklist.push_back(U);
        }
      }
    } else {
      DeadInsts.emplace_back(cast<Instruction>(Phi));
    }
  }

  // Coerce back to the original type and replace the uses.
  for (Instruction *U : Uses) {
    // Replace all converted operands of a use.
    for (auto [OpIdx, Op] : enumerate(U->operands())) {
      if (Value *Val = ValMap.lookup(Op)) {
        Value *NewVal = nullptr;
        if (BBUseValMap.contains(U->getParent()) &&
            BBUseValMap[U->getParent()].contains(Val))
          NewVal = BBUseValMap[U->getParent()][Val];
        else {
          BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
          // We may pick up ops that were previously converted for users in
          // other blocks. If there is an originally typed definition of the
          // Op already in this block, simply reuse it.
          if (isa<Instruction>(Op) && !isa<PHINode>(Op) &&
              U->getParent() == cast<Instruction>(Op)->getParent()) {
            NewVal = Op;
          } else {
            NewVal = convertFromOptType(Op->getType(),
                                        cast<Instruction>(ValMap[Op]),
                                        InsertPt, U->getParent());
            BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
          }
        }
        assert(NewVal);
        U->setOperand(OpIdx, NewVal);
      }
    }
  }

  return true;
}
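// End-to-end sketch (illustrative IR, reusing the ".tc"/".bc" suffixes the
// code above creates): a problematic <4 x i8> PHI is rebuilt over the packed
// type, with conversions at the incoming defs and before the first use:
//
//   before:  %phi = phi <4 x i8> [ %a, %bb0 ], [ %b, %bb1 ]
//   after:   %phi.tc = phi i32 [ %a.bc, %bb0 ], [ %b.bc, %bb1 ]
//            %unpack = bitcast i32 %phi.tc to <4 x i8>  ; in the use block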
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address space.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  Type *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL.getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // That load must be at least naturally aligned.
  if (LI.getAlign() < DL.getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return UA.isUniform(&LI);
}
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if that load is already aligned on DWORD at least as it's handled in
  // SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, DL);
  // If that base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, just replace alignment.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned LdBits = DL.getTypeStoreSizeInBits(LI.getType());
  auto *IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  auto *NewPtr = IRB.CreateConstGEP1_64(
      IRB.getInt8Ty(),
      IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
      Offset - Adjust);

  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  Value *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt),
                      DL.typeSizeEqualsStoreSize(LI.getType()) ? IntNTy
                                                               : LI.getType()),
      LI.getType());
  LI.replaceAllUsesWith(NewVal);
  DeadInsts.emplace_back(&LI);

  return true;
}
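// Before/after sketch (illustrative IR): an i8 load at a DWORD-aligned base
// plus offset 1 becomes an aligned i32 load, a shift by Adjust * 8 = 8 bits,
// and a truncation:
//
//   before:  %v = load i8, ptr addrspace(4) %gep, align 1  ; %gep = %base + 1
//   after:   %w = load i32, ptr addrspace(4) %base, align 4
//            %s = lshr i32 %w, 8
//            %v = trunc i32 %s to i8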
PreservedAnalyses
AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
  UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);

  bool Changed = AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();

  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA = PreservedAnalyses::none();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

class AMDGPULateCodeGenPrepareLegacy : public FunctionPass {
public:
  static char ID;

  AMDGPULateCodeGenPrepareLegacy() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.setPreservesCFG();
  }

  bool runOnFunction(Function &F) override;
};

bool AMDGPULateCodeGenPrepareLegacy::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  UniformityInfo &UI =
      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  return AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepareLegacy::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPrepareLegacyPass() {
  return new AMDGPULateCodeGenPrepareLegacy();
}
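// Usage sketch (the new-PM pass name is assumed to follow DEBUG_TYPE, per the
// AMDGPU pass registry convention): the pass can be exercised in isolation
// with opt, e.g.:
//
//   opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-late-codegenprepare -S in.ll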