Go to the documentation of this file.
47 #define DEBUG_TYPE "hardware-loops"
49 #define HW_LOOPS_NAME "Hardware Loop Insertion"
55 cl::desc(
"Force hardware loops intrinsics to be inserted"));
60 cl::desc(
"Force hardware loop counter to be updated through a phi"));
64 cl::desc(
"Force allowance of nested hardware loops"));
68 cl::desc(
"Set the loop decrement value"));
72 cl::desc(
"Set the loop counter bitwidth"));
77 cl::desc(
"Force generation of loop guard intrinsic"));
79 STATISTIC(NumHWLoops,
"Number of loops converted to hardware loops");
84 dbgs() <<
"HWLoops: " << DebugMsg;
99 CodeRegion =
I->getParent();
102 if (
I->getDebugLoc())
103 DL =
I->getDebugLoc();
107 R <<
"hardware-loop not created: ";
143 bool TryConvertLoop(
Loop *L);
156 bool PreserveLCSSA =
false;
160 bool MadeChange =
false;
165 Value *InitLoopCount();
168 Value *InsertIterationSetup(
Value *LoopCountInit);
171 void InsertLoopDec();
183 void UpdateBranch(
Value *EltsRem);
189 SE(SE),
DL(
DL), ORE(ORE), L(
Info.L),
M(L->getHeader()->getModule()),
190 TripCount(
Info.TripCount),
191 CountType(
Info.CountType),
192 ExitBranch(
Info.ExitBranch),
194 UsePHICounter(
Info.CounterInReg),
195 UseLoopGuard(
Info.PerformEntryTest) { }
205 const SCEV *TripCount =
nullptr;
206 Type *CountType =
nullptr;
209 bool UsePHICounter =
false;
210 bool UseLoopGuard =
false;
223 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
224 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
225 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
226 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
F);
227 DL = &
F.getParent()->getDataLayout();
228 ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
229 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
230 LibInfo = TLIP ? &TLIP->getTLI(
F) :
nullptr;
231 PreserveLCSSA = mustPreserveAnalysisID(
LCSSAID);
232 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
F);
236 if (L->isOutermost())
244 bool HardwareLoops::TryConvertLoop(
Loop *L) {
246 bool AnyChanged =
false;
248 AnyChanged |= TryConvertLoop(SL);
250 reportHWLoopFailure(
"nested hardware-loops not supported",
"HWLoopNested",
255 LLVM_DEBUG(
dbgs() <<
"HWLoops: Loop " << L->getHeader()->getName() <<
"\n");
258 if (!HWLoopInfo.canAnalyze(*LI)) {
259 reportHWLoopFailure(
"cannot analyze loop, irreducible control flow",
260 "HWLoopCannotAnalyze", ORE, L);
266 reportHWLoopFailure(
"it's not profitable to create a hardware-loop",
267 "HWLoopNotProfitable", ORE, L);
273 HWLoopInfo.CountType =
277 HWLoopInfo.LoopDecrement =
280 MadeChange |= TryConvertLoop(HWLoopInfo);
286 Loop *L = HWLoopInfo.
L;
287 LLVM_DEBUG(
dbgs() <<
"HWLoops: Try to convert profitable loop: " << *L);
294 reportHWLoopFailure(
"loop is not a candidate",
"HWLoopNoCandidate", ORE, L);
300 "Hardware Loop must have set exit info.");
310 HardwareLoop HWLoop(HWLoopInfo, *SE, *
DL, ORE);
316 void HardwareLoop::Create() {
319 Value *LoopCountInit = InitLoopCount();
320 if (!LoopCountInit) {
321 reportHWLoopFailure(
"could not safely create a loop count expression",
322 "HWLoopNotSafe", ORE, L);
326 Value *Setup = InsertIterationSetup(LoopCountInit);
329 Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
330 Value *EltsRem = InsertPHICounter(Setup, LoopDec);
332 UpdateBranch(LoopDec);
352 if (BI->isUnconditional() || !isa<ICmpInst>(BI->getCondition()))
357 auto ICmp = cast<ICmpInst>(BI->getCondition());
359 if (!ICmp->isEquality())
362 auto IsCompareZero = [](
ICmpInst *ICmp,
Value *Count,
unsigned OpIdx) {
363 if (
auto *Const = dyn_cast<ConstantInt>(ICmp->
getOperand(OpIdx)))
364 return Const->isZero() && ICmp->
getOperand(OpIdx ^ 1) == Count;
368 if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1))
372 if (BI->getSuccessor(SuccIdx) != Preheader)
378 Value *HardwareLoop::InitLoopCount() {
379 LLVM_DEBUG(
dbgs() <<
"HWLoops: Initialising loop counter value:\n");
390 SE.getZero(TripCount->getType()))) {
394 UseLoopGuard =
false;
397 if (UseLoopGuard &&
BB->getSinglePredecessor() &&
398 cast<BranchInst>(
BB->getTerminator())->isUnconditional()) {
403 UseLoopGuard =
false;
410 << *TripCount <<
"\n");
414 Value *Count = SCEVE.expandCodeFor(TripCount, CountType,
415 BB->getTerminator());
427 <<
" - Expanded Count in " <<
BB->getName() <<
"\n"
428 <<
" - Will insert set counter intrinsic into: "
429 << BeginBB->getName() <<
"\n");
433 Value* HardwareLoop::InsertIterationSetup(
Value *LoopCountInit) {
438 ? (UsePhi ? Intrinsic::test_start_loop_iterations
439 : Intrinsic::test_set_loop_iterations)
440 : (UsePhi ? Intrinsic::start_loop_iterations
441 : Intrinsic::set_loop_iterations);
443 Value *LoopSetup =
Builder.CreateCall(LoopIter, LoopCountInit);
447 assert((isa<BranchInst>(BeginBB->getTerminator()) &&
448 cast<BranchInst>(BeginBB->getTerminator())->isConditional()) &&
449 "Expected conditional branch");
452 UsePhi ?
Builder.CreateExtractValue(LoopSetup, 1) : LoopSetup;
453 auto *LoopGuard = cast<BranchInst>(BeginBB->getTerminator());
454 LoopGuard->setCondition(SetCount);
456 LoopGuard->swapSuccessors();
458 LLVM_DEBUG(
dbgs() <<
"HWLoops: Inserted loop counter: " << *LoopSetup
460 if (UsePhi && UseLoopGuard)
461 LoopSetup =
Builder.CreateExtractValue(LoopSetup, 0);
462 return !UsePhi ? LoopCountInit : LoopSetup;
465 void HardwareLoop::InsertLoopDec() {
472 Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops);
473 Value *OldCond = ExitBranch->getCondition();
474 ExitBranch->setCondition(NewCond);
477 if (!L->
contains(ExitBranch->getSuccessor(0)))
478 ExitBranch->swapSuccessors();
484 LLVM_DEBUG(
dbgs() <<
"HWLoops: Inserted loop dec: " << *NewCond <<
"\n");
494 Value *
Call = CondBuilder.CreateCall(DecFunc, Ops);
496 LLVM_DEBUG(
dbgs() <<
"HWLoops: Inserted loop dec: " << *Call <<
"\n");
497 return cast<Instruction>(Call);
506 Index->addIncoming(NumElts, Preheader);
507 Index->addIncoming(EltsRem, Latch);
512 void HardwareLoop::UpdateBranch(
Value *EltsRem) {
516 Value *OldCond = ExitBranch->getCondition();
517 ExitBranch->setCondition(NewCond);
520 if (!L->
contains(ExitBranch->getSuccessor(0)))
521 ExitBranch->swapSuccessors();
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
static cl::opt< unsigned > CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32), cl::desc("Set the loop counter bitwidth"))
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
A parsed version of the target data layout string, and methods for querying it.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
const Function * getParent() const
Return the enclosing method, or null if none.
Represents a single loop in the control flow graph.
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
The main scalar evolution driver.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
The instances of the Type class are immutable: once they are created, they are never changed.
The legacy pass manager's analysis pass to compute loop information.
static cl::opt< bool > ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false), cl::desc("Force allowance of nested hardware loops"))
static cl::opt< unsigned > LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1), cl::desc("Set the loop decrement value"))
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
static OptimizationRemarkAnalysis createHWLoopAnalysis(StringRef RemarkName, Loop *L, Instruction *I)
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
LLVM Basic Block Representation.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Represent the analysis usage information of a pass.
iterator_range< block_iterator > blocks() const
Legacy analysis pass which computes a DominatorTree.
STATISTIC(NumFunctions, "Total number of functions")
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Analysis containing CSE Info
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
static cl::opt< bool > ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false), cl::desc("Force hardware loops intrinsics to be inserted"))
BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
static void debugHWLoopFailure(const StringRef DebugMsg, Instruction *I)
This class represents an analyzed expression in the program.
This instruction compares its operands according to the predicate given to the constructor.
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
void initializeHardwareLoopsPass(PassRegistry &)
initializer< Ty > init(const Ty &Val)
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
static cl::opt< bool > ForceGuardLoopEntry("force-hardware-loop-guard", cl::Hidden, cl::init(false), cl::desc("Force generation of loop guard intrinsic"))
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool CanGenerateTest(Loop *L, Value *Count)
A Module instance is used to store all the information related to an LLVM module.
void setOperand(unsigned i, Value *Val)
An immutable pass that tracks lazily created AssumptionCache objects.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
FunctionPass * createHardwareLoopsPass()
Create Hardware Loop pass.
A cache of @llvm.assume calls within a function.
Type * getType() const
All values are typed, get the type of this value.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
static bool runOnFunction(Function &F, bool PostInlining)
BlockT * getHeader() const
Provides information about what library functions are available for the current target.
bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
Attributes of a target dependent hardware loop.
bool isSafeToExpandAt(const SCEV *S, const Instruction *InsertionPoint, ScalarEvolution &SE)
Return true if the given expression is safe to expand in the sense that all materialized values are d...
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Predicate getPredicate() const
Return the predicate for this instruction.
static cl::opt< bool > ForceHardwareLoopPHI("force-hardware-loop-phi", cl::Hidden, cl::init(false), cl::desc("Force hardware loop counter to be updated through a phi"))
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
FunctionPass class - This class is used to implement most global optimizations.
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
AnalysisUsage & addRequired()
Value * getOperand(unsigned i) const
Conditional or Unconditional Branch instruction.
LLVM Value Representation.