//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost",
    cl::desc("add masking cost for i1 vectors"), cl::init(true), cl::Hidden);

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
    cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

static cl::opt<bool>
EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
                cl::desc("Enable using coldcc calling conv for cold "
                         "internal functions"));

static cl::opt<bool>
LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
               cl::desc("Do not add instruction count to lsr cost model"));

// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
static cl::opt<unsigned>
SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
                      cl::desc("Loops with a constant trip count smaller than "
                               "this value will not use the count register."));

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

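// popcntd/popcntw give a hardware population count on subtargets that report
// POPCNTD support; some implementations report the instruction as slow.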
TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
    return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow
               ? TTI::PSK_SlowHardware
               : TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

std::optional<Instruction *>
PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
    // Turn PPC lvx -> load if the pointer is known aligned.
    if (getOrEnforceKnownAlignment(
            II.getArgOperand(0), Align(16), IC.getDataLayout(), &II,
            &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
      Value *Ptr = II.getArgOperand(0);
      return new LoadInst(II.getType(), Ptr, "", false, Align(16));
    }
    break;
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x: {
    // Turn PPC VSX loads into normal loads.
    Value *Ptr = II.getArgOperand(0);
    return new LoadInst(II.getType(), Ptr, Twine(""), false, Align(1));
  }
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
    // Turn stvx -> store if the pointer is known aligned.
    if (getOrEnforceKnownAlignment(
            II.getArgOperand(1), Align(16), IC.getDataLayout(), &II,
            &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
      Value *Ptr = II.getArgOperand(1);
      return new StoreInst(II.getArgOperand(0), Ptr, false, Align(16));
    }
    break;
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x: {
    // Turn PPC VSX stores into normal stores.
    Value *Ptr = II.getArgOperand(1);
    return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1));
  }
  case Intrinsic::ppc_altivec_vperm:
    // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
    // Note that ppc_altivec_vperm has a big-endian bias, so when creating
    // a vectorshuffle for little endian, we must undo the transformation
    // performed on vec_perm in altivec.h. That is, we must complement
    // the permutation mask with respect to 31 and reverse the order of
    // V1 and V2.
    if (Constant *Mask = dyn_cast<Constant>(II.getArgOperand(2))) {
      assert(cast<FixedVectorType>(Mask->getType())->getNumElements() == 16 &&
             "Bad type for intrinsic!");

      // Check that all of the elements are integer constants or undefs.
      bool AllEltsOk = true;
      for (unsigned i = 0; i != 16; ++i) {
        Constant *Elt = Mask->getAggregateElement(i);
        if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
          AllEltsOk = false;
          break;
        }
      }

      if (AllEltsOk) {
        // Cast the input vectors to byte vectors.
        Value *Op0 =
            IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType());
        Value *Op1 =
            IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType());
        Value *Result = UndefValue::get(Op0->getType());

        // Only extract each element once.
        Value *ExtractedElts[32];
        memset(ExtractedElts, 0, sizeof(ExtractedElts));

        for (unsigned i = 0; i != 16; ++i) {
          if (isa<UndefValue>(Mask->getAggregateElement(i)))
            continue;
          unsigned Idx =
              cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
          Idx &= 31; // Match the hardware behavior.
          if (DL.isLittleEndian())
            Idx = 31 - Idx;

          if (!ExtractedElts[Idx]) {
            Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
            Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
            ExtractedElts[Idx] = IC.Builder.CreateExtractElement(
                Idx < 16 ? Op0ToUse : Op1ToUse, IC.Builder.getInt32(Idx & 15));
          }

          // Insert this value into the result vector.
          Result = IC.Builder.CreateInsertElement(Result, ExtractedElts[Idx],
                                                  IC.Builder.getInt32(i));
        }
        return CastInst::Create(Instruction::BitCast, Result, II.getType());
      }
    }
    break;
  }
  return std::nullopt;
}

InstructionCost PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty, CostKind);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
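    // A 16-bit signed immediate can be materialized with a single li/addi.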
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

InstructionCost PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind, Inst);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    [[fallthrough]];
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Zero comparisons can use record-form instructions.
    [[fallthrough]];
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
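      // A contiguous run of ones (or its complement) can be encoded by the
      // rotate-and-mask instructions (rlwinm/rldicl and friends).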
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

// Check if the current Type is an MMA vector type. Valid MMA types are
// v256i1 and v512i1.
static bool isMMAType(Type *Ty) {
  return Ty->isVectorTy() && (Ty->getScalarSizeInBits() == 1) &&
         (Ty->getPrimitiveSizeInBits() > 128);
}

InstructionCost PPCTTIImpl::getInstructionCost(const User *U,
                                               ArrayRef<const Value *> Operands,
                                               TTI::TargetCostKind CostKind) {
  // We already implement getCastInstrCost and getMemoryOpCost where we perform
  // the vector adjustment there.
  if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U))
    return BaseT::getInstructionCost(U, Operands, CostKind);

  if (U->getType()->isVectorTy()) {
    // Instructions that need to be split should cost more.
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(U->getType());
    return LT.first * BaseT::getInstructionCost(U, Operands, CostKind);
  }

  return BaseT::getInstructionCost(U, Operands, CostKind);
}

bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                          AssumptionCache &AC,
                                          TargetLibraryInfo *LibInfo,
                                          HardwareLoopInfo &HWLoopInfo) {
  const PPCTargetMachine &TM = ST->getTargetMachine();
  TargetSchedModel SchedModel;
  SchedModel.init(ST);

  // Do not convert small short loops to CTR loop.
  unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
    SmallPtrSet<const Value *, 32> EphValues;
    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
    CodeMetrics Metrics;
    for (BasicBlock *BB : L->blocks())
      Metrics.analyzeBasicBlock(BB, *this, EphValues);
    // 6 is an approximate latency for the mtctr instruction.
    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
      return false;
  }

  // Check that there are no hardware-loop-related intrinsics in the loop.
  for (auto *BB : L->getBlocks())
    for (auto &I : *BB)
      if (auto *Call = dyn_cast<IntrinsicInst>(&I))
        if (Call->getIntrinsicID() == Intrinsic::set_loop_iterations ||
            Call->getIntrinsicID() == Intrinsic::loop_decrement)
          return false;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  // If there is an exit edge known to be frequently taken,
  // we should not transform this loop.
  for (auto &BB : ExitingBlocks) {
    Instruction *TI = BB->getTerminator();
    if (!TI) continue;

    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      uint64_t TrueWeight = 0, FalseWeight = 0;
      if (!BI->isConditional() ||
          !extractBranchWeights(*BI, TrueWeight, FalseWeight))
        continue;

      // If the exit path is more frequent than the loop path,
      // we return here without further analysis for this loop.
      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
      if (( TrueIsExit && FalseWeight < TrueWeight) ||
          (!TrueIsExit && FalseWeight > TrueWeight))
        return false;
    }
  }

  LLVMContext &C = L->getHeader()->getContext();
  HWLoopInfo.CountType = TM.isPPC64() ?
    Type::getInt64Ty(C) : Type::getInt32Ty(C);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  return true;
}

void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  if (ST->getCPUDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, SE, UP, ORE);
}

void PPCTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
// This function returns true to allow using coldcc calling convention.
// Returning true results in coldcc being used for functions which are cold at
// all call sites when the callers of the functions are not calling any other
// non coldcc functions.
bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
  return EnablePPCColdCC;
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always unroll aggressively.
  if (ST->getCPUDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

TTI::MemCmpExpansionOptions
PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
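  // Expand memcmp with 8/4/2/1-byte loads, trying the widest size first.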
  TTI::MemCmpExpansionOptions Options;
  Options.LoadSizes = {8, 4, 2, 1};
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  return Options;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  assert(ClassID == GPRRC || ClassID == FPRRC ||
         ClassID == VRRC || ClassID == VSXRC);
  if (ST->hasVSX()) {
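    // With VSX, the 32 FPRs and the 32 Altivec VRs are aliased onto the 64
    // VSX registers.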
    assert(ClassID == GPRRC || ClassID == VSXRC || ClassID == VRRC);
    return ClassID == VSXRC ? 64 : 32;
  }
  assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC);
  return 32;
}

unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
  if (Vector)
    return ST->hasVSX() ? VSXRC : VRRC;
  else if (Ty && (Ty->getScalarType()->isFloatTy() ||
                  Ty->getScalarType()->isDoubleTy()))
    return ST->hasVSX() ? VSXRC : FPRRC;
  else if (Ty && (Ty->getScalarType()->isFP128Ty() ||
                  Ty->getScalarType()->isPPC_FP128Ty()))
    return VRRC;
  else if (Ty && Ty->getScalarType()->isHalfTy())
    return VSXRC;
  else
    return GPRRC;
}

const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {

  switch (ClassID) {
  default:
    llvm_unreachable("unknown register class");
    return "PPC::unknown register class";
  case GPRRC: return "PPC::GPRRC";
  case FPRRC: return "PPC::FPRRC";
  case VRRC:  return "PPC::VRRC";
  case VSXRC: return "PPC::VSXRC";
  }
}

TypeSize
PPCTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->isPPC64() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasAltivec() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned PPCTTIImpl::getCacheLineSize() const {
  // Starting with P7 we have a cache line size of 128.
  unsigned Directive = ST->getCPUDirective();
  // Assume that Future CPU has the same cache line size as the others.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
      Directive == PPC::DIR_PWR_FUTURE)
    return 128;

  // On other processors return a default of 64 bytes.
  return 64;
}

unsigned PPCTTIImpl::getPrefetchDistance() const {
  return 300;
}

unsigned PPCTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  unsigned Directive = ST->getCPUDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
  // FIXME: the same for P10 as previous gen until POWER10 scheduling is ready
  // Assume that future is the same as the others.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
      Directive == PPC::DIR_PWR_FUTURE)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

// Returns a cost adjustment factor to adjust the cost of vector instructions
// on targets where there is overlap between the vector and scalar units,
// thereby reducing the overall throughput of vector code wrt. scalar code.
// An invalid instruction cost is returned if the type is an MMA vector type.
InstructionCost PPCTTIImpl::vectorCostAdjustmentFactor(unsigned Opcode,
                                                       Type *Ty1, Type *Ty2) {
  // If the vector type is of an MMA type (v256i1, v512i1), an invalid
  // instruction cost is returned. This is to signify to other cost computing
  // functions to return the maximum instruction cost in order to prevent any
  // opportunities for the optimizer to produce MMA types within the IR.
  if (isMMAType(Ty1))
    return InstructionCost::getInvalid();

  if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
    return InstructionCost(1);

  std::pair<InstructionCost, MVT> LT1 = getTypeLegalizationCost(Ty1);
  // If type legalization involves splitting the vector, we don't want to
  // double the cost at every step - only the last step.
  if (LT1.first != 1 || !LT1.second.isVector())
    return InstructionCost(1);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  if (TLI->isOperationExpand(ISD, LT1.second))
    return InstructionCost(1);

  if (Ty2) {
    std::pair<InstructionCost, MVT> LT2 = getTypeLegalizationCost(Ty2);
    if (LT2.first != 1 || !LT2.second.isVector())
      return InstructionCost(1);
  }

  return InstructionCost(2);
}

InstructionCost PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Ty, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  // Fallback to the default implementation.
  InstructionCost Cost = BaseT::getArithmeticInstrCost(
      Opcode, Ty, CostKind, Op1Info, Op2Info);
  return Cost * CostFactor;
}

InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
                                           ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, Type *SubTp,
                                           ArrayRef<const Value *> Args) {

  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // PPC, for both Altivec/VSX, support cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
  return LT.first * CostFactor;
}

InstructionCost PPCTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}

InstructionCost PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Dst, Src);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  Cost *= CostFactor;
  // TODO: Allow non-throughput costs that aren't binary.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost == 0 ? 0 : 1;
  return Cost;
}

InstructionCost PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Opcode, ValTy, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;
  return Cost * CostFactor;
}

InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Val, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
  Cost *= CostFactor;

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0 (or #1 if LE).
    if (ISD == ISD::EXTRACT_VECTOR_ELT &&
        Index == (ST->isLittleEndian() ? 1 : 0))
      return 0;

    return Cost;

  } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) {
    unsigned EltSize = Val->getScalarSizeInBits();
    // Computing on 1 bit values requires extra mask or compare operations.
    unsigned MaskCost = VecMaskCost && EltSize == 1 ? 1 : 0;
    if (ST->hasP9Altivec()) {
      if (ISD == ISD::INSERT_VECTOR_ELT)
        // A move-to VSR and a permute/insert. Assume vector operation cost
        // for both (cost will be 2x on P9).
        return 2 * CostFactor;

      // It's an extract. Maybe we can do a cheap move-from VSR.
      unsigned EltSize = Val->getScalarSizeInBits();
      if (EltSize == 64) {
        unsigned MfvsrdIndex = ST->isLittleEndian() ? 1 : 0;
        if (Index == MfvsrdIndex)
          return 1;
      } else if (EltSize == 32) {
        unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1;
        if (Index == MfvsrwzIndex)
          return 1;
      }

      // We need a vector extract (or mfvsrld). Assume vector operation cost.
      // The cost of the load constant for a vector extract is disregarded
      // (invariant, easily schedulable).
      return CostFactor + MaskCost;

    } else if (ST->hasDirectMove()) {
      // Assume permute has standard cost.
      // Assume move-to/move-from VSR have 2x standard cost.
      if (ISD == ISD::INSERT_VECTOR_ELT)
        return 3;
      return 3 + MaskCost;
    }
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store. Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + Cost;

  return Cost;
}

InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) {

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  InstructionCost Cost =
      BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;

  Cost *= CostFactor;

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);

  // VSX has 32b/64b load instructions. Legalization can handle loading of
  // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
  // PPCTargetLowering can't compute the cost appropriately. So here we
  // explicitly check this case.
  unsigned MemBytes = Src->getPrimitiveSizeInBits();
  if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
      (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
    return 1;

  // Aligned loads and stores are easy.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || *Alignment >= SrcBytes)
    return Cost;

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we could do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
  if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) &&
      *Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // Newer PPC supports unaligned memory access.
  if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
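  // For example, a 16-byte access with 4-byte alignment is modeled as
  // 16/4 - 1 = 3 additional operations per legalized vector.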
  assert(Alignment);
  Cost += LT.first * ((SrcBytes / Alignment->value()) - 1);

  // For a vector type, there is also scalarization overhead (only for
  // stores, loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = cast<FixedVectorType>(Src)->getNumElements(); i < e;
         ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, CostKind, i,
                                 nullptr, nullptr);

  return Cost;
}

InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Opcode, VecTy, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);

  // Firstly, the cost of load/store operation.
  InstructionCost Cost = getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment),
                                         AddressSpace, CostKind);

  // PPC, for both Altivec/VSX, support cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor*(LT.first-1);
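  // For example, with Factor == 2 and a type legalized into LT.first == 2
  // registers, this adds 2 * (2 - 1) = 2 extra shuffles.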

  return Cost;
}

InstructionCost
PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

bool PPCTTIImpl::areTypesABICompatible(const Function *Caller,
                                       const Function *Callee,
                                       const ArrayRef<Type *> &Types) const {

  // We need to ensure that argument promotion does not
  // attempt to promote pointers to MMA types (__vector_pair
  // and __vector_quad) since these types explicitly cannot be
  // passed as arguments. Both of these types are larger than
  // the 128-bit Altivec vectors and have a scalar size of 1 bit.
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  return llvm::none_of(Types, [](Type *Ty) {
    if (Ty->isSized())
      return Ty->isIntOrIntVectorTy(1) && Ty->getPrimitiveSizeInBits() > 128;
    return false;
  });
}

bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
                            LoopInfo *LI, DominatorTree *DT,
                            AssumptionCache *AC, TargetLibraryInfo *LibInfo) {
  // Process nested loops first.
  for (Loop *I : *L)
    if (canSaveCmp(I, BI, SE, LI, DT, AC, LibInfo))
      return false; // Stop search.

  HardwareLoopInfo HWLoopInfo(L);

  if (!HWLoopInfo.canAnalyze(*LI))
    return false;

  if (!isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo))
    return false;

  if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT))
    return false;

  *BI = HWLoopInfo.ExitBranch;
  return true;
}

bool PPCTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                               const TargetTransformInfo::LSRCost &C2) {
  // PowerPC default behaviour here is "instruction number 1st priority".
  // If LsrNoInsnsCost is set, call default implementation.
  if (!LsrNoInsnsCost)
    return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                    C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
           std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                    C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
  else
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
}

bool PPCTTIImpl::isNumRegsMajorCostOfLSR() {
  return false;
}

bool PPCTTIImpl::shouldBuildRelLookupTables() const {
  const PPCTargetMachine &TM = ST->getTargetMachine();
  // XCOFF hasn't implemented lowerRelativeReference, disable non-ELF for now.
  if (!TM.isELFv2ABI())
    return false;
  return BaseT::shouldBuildRelLookupTables();
}

bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
  case Intrinsic::ppc_altivec_lvebx:
  case Intrinsic::ppc_altivec_lvehx:
  case Intrinsic::ppc_altivec_lvewx:
  case Intrinsic::ppc_vsx_lxvd2x:
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x_be:
  case Intrinsic::ppc_vsx_lxvw4x_be:
  case Intrinsic::ppc_vsx_lxvl:
  case Intrinsic::ppc_vsx_lxvll:
  case Intrinsic::ppc_vsx_lxvp: {
    Info.PtrVal = Inst->getArgOperand(0);
    Info.ReadMem = true;
    Info.WriteMem = false;
    return true;
  }
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
  case Intrinsic::ppc_altivec_stvebx:
  case Intrinsic::ppc_altivec_stvehx:
  case Intrinsic::ppc_altivec_stvewx:
  case Intrinsic::ppc_vsx_stxvd2x:
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x_be:
  case Intrinsic::ppc_vsx_stxvw4x_be:
  case Intrinsic::ppc_vsx_stxvl:
  case Intrinsic::ppc_vsx_stxvll:
  case Intrinsic::ppc_vsx_stxvp: {
    Info.PtrVal = Inst->getArgOperand(1);
    Info.ReadMem = false;
    Info.WriteMem = true;
    return true;
  }
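  // The store-conditional intrinsics write memory through their pointer
  // operand, which is operand 0.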
  case Intrinsic::ppc_stbcx:
  case Intrinsic::ppc_sthcx:
  case Intrinsic::ppc_stdcx:
  case Intrinsic::ppc_stwcx: {
    Info.PtrVal = Inst->getArgOperand(0);
    Info.ReadMem = false;
    Info.WriteMem = true;
    return true;
  }
  default:
    break;
  }

  return false;
}

bool PPCTTIImpl::hasActiveVectorLength(unsigned Opcode, Type *DataType,
                                       Align Alignment) const {
  // Only load and store instructions can have variable vector length on Power.
  if (Opcode != Instruction::Load && Opcode != Instruction::Store)
    return false;
  // Loads/stores with length instructions use bits 0-7 of the GPR operand and
  // therefore cannot be used in 32-bit mode.
  if ((!ST->hasP9Vector() && !ST->hasP10Vector()) || !ST->isPPC64())
    return false;
  if (isa<FixedVectorType>(DataType)) {
    unsigned VecWidth = DataType->getPrimitiveSizeInBits();
    return VecWidth == 128;
  }
  Type *ScalarTy = DataType->getScalarType();

  if (ScalarTy->isPointerTy())
    return true;

  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
    return true;

  if (!ScalarTy->isIntegerTy())
    return false;

  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  return IntWidth == 8 || IntWidth == 16 || IntWidth == 32 || IntWidth == 64;
}

InstructionCost PPCTTIImpl::getVPMemoryOpCost(unsigned Opcode, Type *Src,
                                              Align Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              const Instruction *I) {
  InstructionCost Cost = BaseT::getVPMemoryOpCost(Opcode, Src, Alignment,
                                                  AddressSpace, CostKind, I);
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return Cost;
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;

  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  auto *SrcVTy = dyn_cast<FixedVectorType>(Src);
  assert(SrcVTy && "Expected a vector type for VP memory operations");

  if (hasActiveVectorLength(Opcode, Src, Alignment)) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);

    InstructionCost CostFactor =
        vectorCostAdjustmentFactor(Opcode, Src, nullptr);
    if (!CostFactor.isValid())
      return InstructionCost::getMax();

    InstructionCost Cost = LT.first * CostFactor;
    assert(Cost.isValid() && "Expected valid cost");

    // On P9 but not on P10, if the op is misaligned then it will cause a
    // pipeline flush. Otherwise the VSX masked memops cost the same as unmasked
    // ones.
    const Align DesiredAlignment(16);
    if (Alignment >= DesiredAlignment || ST->getCPUDirective() != PPC::DIR_PWR9)
      return Cost;

    // Since alignment may be underestimated, we try to compute the probability
    // that the actual address is aligned to the desired boundary. For example
    // an 8-byte aligned load is assumed to be actually 16-byte aligned half the
    // time, while a 4-byte aligned load has a 25% chance of being 16-byte
    // aligned.
    float AlignmentProb = ((float)Alignment.value()) / DesiredAlignment.value();
    float MisalignmentProb = 1.0 - AlignmentProb;
    return (MisalignmentProb * P9PipelineFlushEstimate) +
           (AlignmentProb * *Cost.getValue());
  }

  // Usually we should not get to this point, but the following is an attempt to
  // model the cost of legalization. Currently we can only lower intrinsics with
  // evl but no mask, on Power 9/10. Otherwise, we must scalarize.
  return getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

bool PPCTTIImpl::supportsTailCallFor(const CallBase *CB) const {
  return TLI->supportsTailCallFor(CB);
}