//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

static cl::opt<bool>
EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
                cl::desc("Enable using coldcc calling conv for cold "
                         "internal functions"));

static cl::opt<bool>
LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
               cl::desc("Do not add instruction count to lsr cost model"));

// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
static cl::opt<unsigned>
SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
                      cl::desc("Loops with a constant trip count smaller than "
                               "this value will not use the count register."));

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
    return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow
               ? TTI::PSK_SlowHardware
               : TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

std::optional<Instruction *>
PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
    // Turn PPC lvx -> load if the pointer is known aligned.
    if (getOrEnforceKnownAlignment(
            II.getArgOperand(0), Align(16), IC.getDataLayout(), &II,
            &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
      Value *Ptr = IC.Builder.CreateBitCast(
          II.getArgOperand(0), PointerType::getUnqual(II.getType()));
      return new LoadInst(II.getType(), Ptr, "", false, Align(16));
    }
    break;
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x: {
    // Turn PPC VSX loads into normal loads.
    Value *Ptr = IC.Builder.CreateBitCast(
        II.getArgOperand(0), PointerType::getUnqual(II.getType()));
    return new LoadInst(II.getType(), Ptr, Twine(""), false, Align(1));
  }
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
    // Turn stvx -> store if the pointer is known aligned.
    if (getOrEnforceKnownAlignment(
            II.getArgOperand(1), Align(16), IC.getDataLayout(), &II,
            &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
      Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType());
      Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
      return new StoreInst(II.getArgOperand(0), Ptr, false, Align(16));
    }
    break;
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x: {
    // Turn PPC VSX stores into normal stores.
    Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType());
    Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
    return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1));
  }
  case Intrinsic::ppc_altivec_vperm:
    // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
    // Note that ppc_altivec_vperm has a big-endian bias, so when creating
    // a vectorshuffle for little endian, we must undo the transformation
    // performed on vec_perm in altivec.h. That is, we must complement
    // the permutation mask with respect to 31 and reverse the order of
    // V1 and V2.
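    // For example, a mask element of 31 on a little-endian target becomes
    // index 0 after the complement, and the byte is taken from the value
    // passed as V2 because the operand order is reversed as well.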
    if (Constant *Mask = dyn_cast<Constant>(II.getArgOperand(2))) {
      assert(cast<FixedVectorType>(Mask->getType())->getNumElements() == 16 &&
             "Bad type for intrinsic!");

      // Check that all of the elements are integer constants or undefs.
      bool AllEltsOk = true;
      for (unsigned i = 0; i != 16; ++i) {
        Constant *Elt = Mask->getAggregateElement(i);
        if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
          AllEltsOk = false;
          break;
        }
      }

      if (AllEltsOk) {
        // Cast the input vectors to byte vectors.
        Value *Op0 =
            IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType());
        Value *Op1 =
            IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType());
        Value *Result = UndefValue::get(Op0->getType());

        // Only extract each element once.
        Value *ExtractedElts[32];
        memset(ExtractedElts, 0, sizeof(ExtractedElts));

        for (unsigned i = 0; i != 16; ++i) {
          if (isa<UndefValue>(Mask->getAggregateElement(i)))
            continue;
          unsigned Idx =
              cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
          Idx &= 31; // Match the hardware behavior.
          if (DL.isLittleEndian())
            Idx = 31 - Idx;

          if (!ExtractedElts[Idx]) {
            Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
            Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
            ExtractedElts[Idx] = IC.Builder.CreateExtractElement(
                Idx < 16 ? Op0ToUse : Op1ToUse, IC.Builder.getInt32(Idx & 15));
          }

          // Insert this value into the result vector.
          Result = IC.Builder.CreateInsertElement(Result, ExtractedElts[Idx],
                                                  IC.Builder.getInt32(i));
        }
        return CastInst::Create(Instruction::BitCast, Result, II.getType());
      }
    }
    break;
  }
  return std::nullopt;
}

InstructionCost PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty, CostKind);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
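      // For example, 0x12340000 is a single lis; a 32-bit value with a
      // non-zero low half needs an extra ori/addi, hence the doubled cost.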
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

InstructionCost PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind, Inst);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
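    // A shifted run of ones (or its complement) folds into rlwinm/rldicl-style
    // rotate-and-mask forms, so such AND immediates are treated as free below.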
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    [[fallthrough]];
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Zero comparisons can use record-form instructions.
    [[fallthrough]];
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

// Check if the current Type is an MMA vector type. Valid MMA types are
// v256i1 (__vector_pair) and v512i1 (__vector_quad).
static bool isMMAType(Type *Ty) {
  return Ty->isVectorTy() && (Ty->getScalarSizeInBits() == 1) &&
         (Ty->getPrimitiveSizeInBits() > 128);
}

InstructionCost PPCTTIImpl::getInstructionCost(const User *U,
                                               ArrayRef<const Value *> Operands,
                                               TTI::TargetCostKind CostKind) {
  // We already implement getCastInstrCost and getMemoryOpCost, and perform
  // the vector adjustment there.
  if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U))
    return BaseT::getInstructionCost(U, Operands, CostKind);

  if (U->getType()->isVectorTy()) {
    // Instructions that need to be split should cost more.
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(U->getType());
    return LT.first * BaseT::getInstructionCost(U, Operands, CostKind);
  }

  return BaseT::getInstructionCost(U, Operands, CostKind);
}

341
343 AssumptionCache &AC,
344 TargetLibraryInfo *LibInfo,
345 HardwareLoopInfo &HWLoopInfo) {
346 const PPCTargetMachine &TM = ST->getTargetMachine();
347 TargetSchedModel SchedModel;
348 SchedModel.init(ST);
349
350 // Do not convert small short loops to CTR loop.
351 unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
352 if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
354 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
356 for (BasicBlock *BB : L->blocks())
357 Metrics.analyzeBasicBlock(BB, *this, EphValues);
358 // 6 is an approximate latency for the mtctr instruction.
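    // For example, with an issue width of 4, loops of at most 24 instructions
    // (and a small constant trip count) are left as ordinary compare/branch
    // loops rather than CTR loops.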
    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
      return false;
  }

  // Check that there are no hardware-loop-related intrinsics in the loop.
  for (auto *BB : L->getBlocks())
    for (auto &I : *BB)
      if (auto *Call = dyn_cast<IntrinsicInst>(&I))
        if (Call->getIntrinsicID() == Intrinsic::set_loop_iterations ||
            Call->getIntrinsicID() == Intrinsic::loop_decrement)
          return false;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  // If there is an exit edge known to be frequently taken,
  // we should not transform this loop.
  for (auto &BB : ExitingBlocks) {
    Instruction *TI = BB->getTerminator();
    if (!TI) continue;

    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      uint64_t TrueWeight = 0, FalseWeight = 0;
      if (!BI->isConditional() ||
          !extractBranchWeights(*BI, TrueWeight, FalseWeight))
        continue;

      // If the exit path is more frequent than the loop path,
      // we return here without further analysis for this loop.
      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
      if ((TrueIsExit && FalseWeight < TrueWeight) ||
          (!TrueIsExit && FalseWeight > TrueWeight))
        return false;
    }
  }

  LLVMContext &C = L->getHeader()->getContext();
  HWLoopInfo.CountType = TM.isPPC64() ?
    Type::getInt64Ty(C) : Type::getInt32Ty(C);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  return true;
}

void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  if (ST->getCPUDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, SE, UP, ORE);
}

void PPCTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
// This function returns true to allow using coldcc calling convention.
// Returning true results in coldcc being used for functions which are cold at
// all call sites when the callers of the functions are not calling any other
// non-coldcc functions.
bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
  return EnablePPCColdCC;
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always unroll aggressively.
  if (ST->getCPUDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

TTI::MemCmpExpansionOptions
PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  Options.LoadSizes = {8, 4, 2, 1};
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  return Options;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  assert(ClassID == GPRRC || ClassID == FPRRC ||
         ClassID == VRRC || ClassID == VSXRC);
  if (ST->hasVSX()) {
    assert(ClassID == GPRRC || ClassID == VSXRC || ClassID == VRRC);
    return ClassID == VSXRC ? 64 : 32;
  }
  assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC);
  return 32;
}

unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
  if (Vector)
    return ST->hasVSX() ? VSXRC : VRRC;
  else if (Ty && (Ty->getScalarType()->isFloatTy() ||
                  Ty->getScalarType()->isDoubleTy()))
    return ST->hasVSX() ? VSXRC : FPRRC;
  else if (Ty && (Ty->getScalarType()->isFP128Ty() ||
                  Ty->getScalarType()->isPPC_FP128Ty()))
    return VRRC;
  else if (Ty && Ty->getScalarType()->isHalfTy())
    return VSXRC;
  else
    return GPRRC;
}

const char *PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {

  switch (ClassID) {
  default:
    llvm_unreachable("unknown register class");
    return "PPC::unknown register class";
  case GPRRC: return "PPC::GPRRC";
  case FPRRC: return "PPC::FPRRC";
  case VRRC: return "PPC::VRRC";
  case VSXRC: return "PPC::VSXRC";
  }
}

TypeSize
PPCTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->isPPC64() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasAltivec() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned PPCTTIImpl::getCacheLineSize() const {
  // Starting with P7 we have a cache line size of 128.
  unsigned Directive = ST->getCPUDirective();
  // Assume that Future CPU has the same cache line size as the others.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
      Directive == PPC::DIR_PWR_FUTURE)
    return 128;

  // On other processors return a default of 64 bytes.
  return 64;
}

unsigned PPCTTIImpl::getPrefetchDistance() const {
  return 300;
}

unsigned PPCTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  unsigned Directive = ST->getCPUDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
  // FIXME: the same for P10 as previous gen until POWER10 scheduling is ready
  // Assume that future is the same as the others.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
      Directive == PPC::DIR_PWR_FUTURE)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

// Returns a cost adjustment factor to adjust the cost of vector instructions
// on targets where there is overlap between the vector and scalar units,
// thereby reducing the overall throughput of vector code relative to scalar
// code. An invalid instruction cost is returned if the type is an MMA vector
// type.
InstructionCost PPCTTIImpl::vectorCostAdjustmentFactor(unsigned Opcode,
                                                       Type *Ty1, Type *Ty2) {
  // If the vector type is of an MMA type (v256i1, v512i1), an invalid
  // instruction cost is returned. This is to signify to other cost computing
  // functions to return the maximum instruction cost in order to prevent any
  // opportunities for the optimizer to produce MMA types within the IR.
  if (isMMAType(Ty1))
    return InstructionCost::getInvalid();

  if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
    return InstructionCost(1);

  std::pair<InstructionCost, MVT> LT1 = getTypeLegalizationCost(Ty1);
  // If type legalization involves splitting the vector, we don't want to
  // double the cost at every step - only the last step.
  if (LT1.first != 1 || !LT1.second.isVector())
    return InstructionCost(1);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  if (TLI->isOperationExpand(ISD, LT1.second))
    return InstructionCost(1);

  if (Ty2) {
    std::pair<InstructionCost, MVT> LT2 = getTypeLegalizationCost(Ty2);
    if (LT2.first != 1 || !LT2.second.isVector())
      return InstructionCost(1);
  }

  return InstructionCost(2);
}

InstructionCost PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Ty, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  // Fallback to the default implementation.
  InstructionCost Cost = BaseT::getArithmeticInstrCost(
      Opcode, Ty, CostKind, Op1Info, Op2Info);
  return Cost * CostFactor;
}

InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
                                           ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, Type *SubTp,
                                           ArrayRef<const Value *> Args) {

  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // PPC, for both Altivec/VSX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
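  // For example, reversing a <4 x i32> legalizes to a single vector register
  // (LT.first == 1), so it is charged as one permute-class instruction times
  // the adjustment factor.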
  return LT.first * CostFactor;
}

InstructionCost PPCTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}

InstructionCost PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Dst, Src);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  Cost *= CostFactor;
  // TODO: Allow non-throughput costs that aren't binary.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost == 0 ? 0 : 1;
  return Cost;
}

InstructionCost PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Opcode, ValTy, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;
  return Cost * CostFactor;
}

InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Val, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
  Cost *= CostFactor;

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0 (or #1 if LE).
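    // Since the scalar FP registers overlap the VSX registers, extracting
    // that lane needs no separate instruction, so it is modeled as free.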
    if (ISD == ISD::EXTRACT_VECTOR_ELT &&
        Index == (ST->isLittleEndian() ? 1 : 0))
      return 0;

    return Cost;

  } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) {
    if (ST->hasP9Altivec()) {
      if (ISD == ISD::INSERT_VECTOR_ELT)
        // A move-to VSR and a permute/insert. Assume vector operation cost
        // for both (cost will be 2x on P9).
        return 2 * CostFactor;

      // It's an extract. Maybe we can do a cheap move-from VSR.
      unsigned EltSize = Val->getScalarSizeInBits();
      if (EltSize == 64) {
        unsigned MfvsrdIndex = ST->isLittleEndian() ? 1 : 0;
        if (Index == MfvsrdIndex)
          return 1;
      } else if (EltSize == 32) {
        unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1;
        if (Index == MfvsrwzIndex)
          return 1;
      }

      // We need a vector extract (or mfvsrld). Assume vector operation cost.
      // The cost of the load constant for a vector extract is disregarded
      // (invariant, easily schedulable).
      return CostFactor;

    } else if (ST->hasDirectMove())
      // Assume permute has standard cost.
      // Assume move-to/move-from VSR have 2x standard cost.
      return 3;
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
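  // (A load-hit-store stall happens when a load executes before an older
  // store to the same address has drained from the store queue, forcing the
  // load to wait or the pipeline to flush.)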
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store. Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + Cost;

  return Cost;
}

InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) {

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  InstructionCost Cost =
      BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;

  Cost *= CostFactor;

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);

  // VSX has 32b/64b load instructions. Legalization can handle loading of
  // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
  // PPCTargetLowering can't compute the cost appropriately. So here we
  // explicitly check this case.
  unsigned MemBytes = Src->getPrimitiveSizeInBits();
  if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
      (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
    return 1;

  // Aligned loads and stores are easy.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || *Alignment >= SrcBytes)
    return Cost;

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we could do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
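  // For example, a misaligned Altivec v4f32 load is typically lowered to two
  // lvx loads combined with a vperm (with lvsl supplying the permute control);
  // the one extra permute per legalized register is what LT.first adds below.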
  if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) &&
      *Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // Newer PPC supports unaligned memory access.
  if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  assert(Alignment);
  Cost += LT.first * ((SrcBytes / Alignment->value()) - 1);

  // For a vector type, there is also scalarization overhead (only for
  // stores, loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = cast<FixedVectorType>(Src)->getNumElements(); i < e;
         ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, CostKind, i,
                                 nullptr, nullptr);

  return Cost;
}

InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Opcode, VecTy, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);

  // Firstly, the cost of load/store operation.
  InstructionCost Cost = getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment),
                                         AddressSpace, CostKind);

  // PPC, for both Altivec/VSX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
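  // For example, a factor-2 access whose vector type legalizes to two
  // registers (LT.first == 2) adds 2 * (2 - 1) = 2 shuffles on top of the
  // load/store cost.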
  Cost += Factor * (LT.first - 1);

  return Cost;
}

InstructionCost
PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

bool PPCTTIImpl::areTypesABICompatible(const Function *Caller,
                                       const Function *Callee,
                                       const ArrayRef<Type *> &Types) const {

  // We need to ensure that argument promotion does not
  // attempt to promote pointers to MMA types (__vector_pair
  // and __vector_quad) since these types explicitly cannot be
  // passed as arguments. Both of these types are larger than
  // the 128-bit Altivec vectors and have a scalar size of 1 bit.
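  // A __vector_quad, for instance, is v512i1: an i1 vector whose primitive
  // size (512 bits) exceeds 128, so the predicate below rejects it.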
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  return llvm::none_of(Types, [](Type *Ty) {
    if (Ty->isSized())
      return Ty->isIntOrIntVectorTy(1) && Ty->getPrimitiveSizeInBits() > 128;
    return false;
  });
}

bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
                            LoopInfo *LI, DominatorTree *DT,
                            AssumptionCache *AC, TargetLibraryInfo *LibInfo) {
  // Process nested loops first.
  for (Loop *I : *L)
    if (canSaveCmp(I, BI, SE, LI, DT, AC, LibInfo))
      return false; // Stop search.

  HardwareLoopInfo HWLoopInfo(L);

  if (!HWLoopInfo.canAnalyze(*LI))
    return false;

  if (!isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo))
    return false;

  if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT))
    return false;

  *BI = HWLoopInfo.ExitBranch;
  return true;
}

bool PPCTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                               const TargetTransformInfo::LSRCost &C2) {
  // PowerPC default behaviour here is "instruction number 1st priority".
  // If LsrNoInsnsCost is set, call the default implementation.
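  // Comparing Insns first makes total instruction count the primary
  // tie-breaker here, whereas the generic implementation starts from the
  // register count.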
  if (!LsrNoInsnsCost)
    return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                    C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
           std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                    C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
  else
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
}

bool PPCTTIImpl::isNumRegsMajorCostOfLSR() {
  return false;
}

bool PPCTTIImpl::shouldBuildRelLookupTables() const {
  const PPCTargetMachine &TM = ST->getTargetMachine();
  // XCOFF hasn't implemented lowerRelativeReference, disable non-ELF for now.
  if (!TM.isELFv2ABI())
    return false;
  return BaseT::shouldBuildRelLookupTables();
}

bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
  case Intrinsic::ppc_altivec_lvebx:
  case Intrinsic::ppc_altivec_lvehx:
  case Intrinsic::ppc_altivec_lvewx:
  case Intrinsic::ppc_vsx_lxvd2x:
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x_be:
  case Intrinsic::ppc_vsx_lxvw4x_be:
  case Intrinsic::ppc_vsx_lxvl:
  case Intrinsic::ppc_vsx_lxvll:
  case Intrinsic::ppc_vsx_lxvp: {
    Info.PtrVal = Inst->getArgOperand(0);
    Info.ReadMem = true;
    Info.WriteMem = false;
    return true;
  }
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
  case Intrinsic::ppc_altivec_stvebx:
  case Intrinsic::ppc_altivec_stvehx:
  case Intrinsic::ppc_altivec_stvewx:
  case Intrinsic::ppc_vsx_stxvd2x:
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x_be:
  case Intrinsic::ppc_vsx_stxvw4x_be:
  case Intrinsic::ppc_vsx_stxvl:
  case Intrinsic::ppc_vsx_stxvll:
  case Intrinsic::ppc_vsx_stxvp: {
    Info.PtrVal = Inst->getArgOperand(1);
    Info.ReadMem = false;
    Info.WriteMem = true;
    return true;
  }
  case Intrinsic::ppc_stbcx:
  case Intrinsic::ppc_sthcx:
  case Intrinsic::ppc_stdcx:
  case Intrinsic::ppc_stwcx: {
    Info.PtrVal = Inst->getArgOperand(0);
    Info.ReadMem = false;
    Info.WriteMem = true;
    return true;
  }
  default:
    break;
  }

  return false;
}

bool PPCTTIImpl::hasActiveVectorLength(unsigned Opcode, Type *DataType,
                                       Align Alignment) const {
  // Only load and store instructions can have a variable vector length on
  // Power.
  if (Opcode != Instruction::Load && Opcode != Instruction::Store)
    return false;
  // Loads/stores with length instructions use bits 0-7 of the GPR operand and
  // therefore cannot be used in 32-bit mode.
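  // (lxvl/stxvl expect the byte count in the most significant byte of a
  // 64-bit GPR, i.e. the length shifted left by 56.)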
  if ((!ST->hasP9Vector() && !ST->hasP10Vector()) || !ST->isPPC64())
    return false;
  if (isa<FixedVectorType>(DataType)) {
    unsigned VecWidth = DataType->getPrimitiveSizeInBits();
    return VecWidth == 128;
  }
  Type *ScalarTy = DataType->getScalarType();

  if (ScalarTy->isPointerTy())
    return true;

  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
    return true;

  if (!ScalarTy->isIntegerTy())
    return false;

  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  return IntWidth == 8 || IntWidth == 16 || IntWidth == 32 || IntWidth == 64;
}

InstructionCost PPCTTIImpl::getVPMemoryOpCost(unsigned Opcode, Type *Src,
                                              Align Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              const Instruction *I) {
  InstructionCost Cost = BaseT::getVPMemoryOpCost(Opcode, Src, Alignment,
                                                  AddressSpace, CostKind, I);
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return Cost;
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;

  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  auto *SrcVTy = dyn_cast<FixedVectorType>(Src);
  assert(SrcVTy && "Expected a vector type for VP memory operations");

  if (hasActiveVectorLength(Opcode, Src, Alignment)) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);

    InstructionCost CostFactor =
        vectorCostAdjustmentFactor(Opcode, Src, nullptr);
    if (!CostFactor.isValid())
      return InstructionCost::getMax();

    InstructionCost Cost = LT.first * CostFactor;
    assert(Cost.isValid() && "Expected valid cost");

    // On P9 but not on P10, if the op is misaligned then it will cause a
    // pipeline flush. Otherwise the VSX masked memops cost the same as unmasked
    // ones.
    const Align DesiredAlignment(16);
    if (Alignment >= DesiredAlignment || ST->getCPUDirective() != PPC::DIR_PWR9)
      return Cost;

    // Since alignment may be underestimated, we try to compute the probability
    // that the actual address is aligned to the desired boundary. For example
    // an 8-byte aligned load is assumed to be actually 16-byte aligned half the
    // time, while a 4-byte aligned load has a 25% chance of being 16-byte
    // aligned.
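    // E.g. with Alignment = 8, AlignmentProb = 8/16 = 0.5, so half of the
    // estimated P9 pipeline-flush penalty is folded into the returned cost.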
    float AlignmentProb = ((float)Alignment.value()) / DesiredAlignment.value();
    float MisalignmentProb = 1.0 - AlignmentProb;
    return (MisalignmentProb * P9PipelineFlushEstimate) +
           (AlignmentProb * *Cost.getValue());
  }

  // Usually we should not get to this point, but the following is an attempt to
  // model the cost of legalization. Currently we can only lower intrinsics with
  // evl but no mask, on Power 9/10. Otherwise, we must scalarize.
  return getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

bool PPCTTIImpl::supportsTailCallFor(const CallBase *CB) const {
  return TLI->supportsTailCallFor(CB);
}