LLVM 17.0.0git
SystemZTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a TargetTransformInfo analysis pass specific to the
10// SystemZ target machine. It uses the target's detailed information to provide
11// more precise answers to certain TTI queries, while letting the target
12// independent and default TTI implementations handle the rest.
13//
14//===----------------------------------------------------------------------===//
15
22#include "llvm/Support/Debug.h"
23using namespace llvm;
24
25#define DEBUG_TYPE "systemztti"
26
27//===----------------------------------------------------------------------===//
28//
29// SystemZ cost model.
30//
31//===----------------------------------------------------------------------===//
32
33static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
34 bool UsedAsMemCpySource = false;
35 for (const User *U : V->users())
36 if (const Instruction *User = dyn_cast<Instruction>(U)) {
37 if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
38 UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
39 continue;
40 }
41 if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
42 if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
43 UsedAsMemCpySource = true;
44 continue;
45 }
46 }
47 OtherUse = true;
48 }
49 return UsedAsMemCpySource;
50}
51
53 unsigned Bonus = 0;
54
55 // Increase the threshold if an incoming argument is used only as a memcpy
56 // source.
58 for (Argument &Arg : Callee->args()) {
59 bool OtherUse = false;
60 if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse)
61 Bonus += 150;
62 }
63
64 LLVM_DEBUG(if (Bonus)
65 dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
66 return Bonus;
67}
68
71 assert(Ty->isIntegerTy());
72
73 unsigned BitSize = Ty->getPrimitiveSizeInBits();
74 // There is no cost model for constants with a bit size of 0. Return TCC_Free
75 // here, so that constant hoisting will ignore this constant.
76 if (BitSize == 0)
77 return TTI::TCC_Free;
78 // No cost model for operations on integers larger than 64 bit implemented yet.
79 if (BitSize > 64)
80 return TTI::TCC_Free;
81
82 if (Imm == 0)
83 return TTI::TCC_Free;
84
85 if (Imm.getBitWidth() <= 64) {
86 // Constants loaded via lgfi.
87 if (isInt<32>(Imm.getSExtValue()))
88 return TTI::TCC_Basic;
89 // Constants loaded via llilf.
90 if (isUInt<32>(Imm.getZExtValue()))
91 return TTI::TCC_Basic;
92 // Constants loaded via llihf:
93 if ((Imm.getZExtValue() & 0xffffffff) == 0)
94 return TTI::TCC_Basic;
95
96 return 2 * TTI::TCC_Basic;
97 }
98
99 return 4 * TTI::TCC_Basic;
100}
101
103 const APInt &Imm, Type *Ty,
105 Instruction *Inst) {
106 assert(Ty->isIntegerTy());
107
108 unsigned BitSize = Ty->getPrimitiveSizeInBits();
109 // There is no cost model for constants with a bit size of 0. Return TCC_Free
110 // here, so that constant hoisting will ignore this constant.
111 if (BitSize == 0)
112 return TTI::TCC_Free;
113 // No cost model for operations on integers larger than 64 bit implemented yet.
114 if (BitSize > 64)
115 return TTI::TCC_Free;
116
117 switch (Opcode) {
118 default:
119 return TTI::TCC_Free;
120 case Instruction::GetElementPtr:
121 // Always hoist the base address of a GetElementPtr. This prevents the
122 // creation of new constants for every base constant that gets constant
123 // folded with the offset.
124 if (Idx == 0)
125 return 2 * TTI::TCC_Basic;
126 return TTI::TCC_Free;
127 case Instruction::Store:
128 if (Idx == 0 && Imm.getBitWidth() <= 64) {
129 // Any 8-bit immediate store can be implemented via mvi.
130 if (BitSize == 8)
131 return TTI::TCC_Free;
132 // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
133 if (isInt<16>(Imm.getSExtValue()))
134 return TTI::TCC_Free;
135 }
136 break;
137 case Instruction::ICmp:
138 if (Idx == 1 && Imm.getBitWidth() <= 64) {
139 // Comparisons against signed 32-bit immediates implemented via cgfi.
140 if (isInt<32>(Imm.getSExtValue()))
141 return TTI::TCC_Free;
142 // Comparisons against unsigned 32-bit immediates implemented via clgfi.
143 if (isUInt<32>(Imm.getZExtValue()))
144 return TTI::TCC_Free;
145 }
146 break;
147 case Instruction::Add:
148 case Instruction::Sub:
149 if (Idx == 1 && Imm.getBitWidth() <= 64) {
150 // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
151 if (isUInt<32>(Imm.getZExtValue()))
152 return TTI::TCC_Free;
153 // Or their negation, by swapping addition vs. subtraction.
154 if (isUInt<32>(-Imm.getSExtValue()))
155 return TTI::TCC_Free;
156 }
157 break;
158 case Instruction::Mul:
159 if (Idx == 1 && Imm.getBitWidth() <= 64) {
160 // We use msgfi to multiply by 32-bit signed immediates.
161 if (isInt<32>(Imm.getSExtValue()))
162 return TTI::TCC_Free;
163 }
164 break;
165 case Instruction::Or:
166 case Instruction::Xor:
167 if (Idx == 1 && Imm.getBitWidth() <= 64) {
168 // Masks supported by oilf/xilf.
169 if (isUInt<32>(Imm.getZExtValue()))
170 return TTI::TCC_Free;
171 // Masks supported by oihf/xihf.
172 if ((Imm.getZExtValue() & 0xffffffff) == 0)
173 return TTI::TCC_Free;
174 }
175 break;
176 case Instruction::And:
177 if (Idx == 1 && Imm.getBitWidth() <= 64) {
178 // Any 32-bit AND operation can be implemented via nilf.
179 if (BitSize <= 32)
180 return TTI::TCC_Free;
181 // 64-bit masks supported by nilf.
182 if (isUInt<32>(~Imm.getZExtValue()))
183 return TTI::TCC_Free;
184 // 64-bit masks supported by nilh.
185 if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
186 return TTI::TCC_Free;
187 // Some 64-bit AND operations can be implemented via risbg.
188 const SystemZInstrInfo *TII = ST->getInstrInfo();
189 unsigned Start, End;
190 if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
191 return TTI::TCC_Free;
192 }
193 break;
194 case Instruction::Shl:
195 case Instruction::LShr:
196 case Instruction::AShr:
197 // Always return TCC_Free for the shift value of a shift instruction.
198 if (Idx == 1)
199 return TTI::TCC_Free;
200 break;
201 case Instruction::UDiv:
202 case Instruction::SDiv:
203 case Instruction::URem:
204 case Instruction::SRem:
205 case Instruction::Trunc:
206 case Instruction::ZExt:
207 case Instruction::SExt:
208 case Instruction::IntToPtr:
209 case Instruction::PtrToInt:
210 case Instruction::BitCast:
211 case Instruction::PHI:
212 case Instruction::Call:
213 case Instruction::Select:
214 case Instruction::Ret:
215 case Instruction::Load:
216 break;
217 }
218
220}
221
224 const APInt &Imm, Type *Ty,
226 assert(Ty->isIntegerTy());
227
228 unsigned BitSize = Ty->getPrimitiveSizeInBits();
229 // There is no cost model for constants with a bit size of 0. Return TCC_Free
230 // here, so that constant hoisting will ignore this constant.
231 if (BitSize == 0)
232 return TTI::TCC_Free;
233 // No cost model for operations on integers larger than 64 bit implemented yet.
234 if (BitSize > 64)
235 return TTI::TCC_Free;
236
237 switch (IID) {
238 default:
239 return TTI::TCC_Free;
240 case Intrinsic::sadd_with_overflow:
241 case Intrinsic::uadd_with_overflow:
242 case Intrinsic::ssub_with_overflow:
243 case Intrinsic::usub_with_overflow:
244 // These get expanded to include a normal addition/subtraction.
245 if (Idx == 1 && Imm.getBitWidth() <= 64) {
246 if (isUInt<32>(Imm.getZExtValue()))
247 return TTI::TCC_Free;
248 if (isUInt<32>(-Imm.getSExtValue()))
249 return TTI::TCC_Free;
250 }
251 break;
252 case Intrinsic::smul_with_overflow:
253 case Intrinsic::umul_with_overflow:
254 // These get expanded to include a normal multiplication.
255 if (Idx == 1 && Imm.getBitWidth() <= 64) {
256 if (isInt<32>(Imm.getSExtValue()))
257 return TTI::TCC_Free;
258 }
259 break;
260 case Intrinsic::experimental_stackmap:
261 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
262 return TTI::TCC_Free;
263 break;
264 case Intrinsic::experimental_patchpoint_void:
265 case Intrinsic::experimental_patchpoint_i64:
266 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
267 return TTI::TCC_Free;
268 break;
269 }
271}
272
275 assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
276 if (ST->hasPopulationCount() && TyWidth <= 64)
278 return TTI::PSK_Software;
279}
280
284 // Find out if L contains a call, what the machine instruction count
285 // estimate is, and how many stores there are.
286 bool HasCall = false;
287 InstructionCost NumStores = 0;
288 for (auto &BB : L->blocks())
289 for (auto &I : *BB) {
290 if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
291 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
292 if (isLoweredToCall(F))
293 HasCall = true;
294 if (F->getIntrinsicID() == Intrinsic::memcpy ||
295 F->getIntrinsicID() == Intrinsic::memset)
296 NumStores++;
297 } else { // indirect call.
298 HasCall = true;
299 }
300 }
301 if (isa<StoreInst>(&I)) {
302 Type *MemAccessTy = I.getOperand(0)->getType();
303 NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy,
304 std::nullopt, 0, TTI::TCK_RecipThroughput);
305 }
306 }
307
308 // The z13 processor will run out of store tags if too many stores
309 // are fed into it too quickly. Therefore make sure there are not
310 // too many stores in the resulting unrolled loop.
311 unsigned const NumStoresVal = *NumStores.getValue();
312 unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);
313
314 if (HasCall) {
315 // Only allow full unrolling if loop has any calls.
316 UP.FullUnrollMaxCount = Max;
317 UP.MaxCount = 1;
318 return;
319 }
320
321 UP.MaxCount = Max;
322 if (UP.MaxCount <= 1)
323 return;
324
325 // Allow partial and runtime trip count unrolling.
326 UP.Partial = UP.Runtime = true;
327
328 UP.PartialThreshold = 75;
330
331 // Allow expensive instructions in the pre-header of the loop.
332 UP.AllowExpensiveTripCount = true;
333
334 UP.Force = true;
335}
336
340}
341
344 // SystemZ specific: check instruction count (first), and don't care about
345 // ImmCost, since offsets are checked explicitly.
346 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
347 C1.NumIVMuls, C1.NumBaseAdds,
348 C1.ScaleCost, C1.SetupCost) <
349 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
350 C2.NumIVMuls, C2.NumBaseAdds,
351 C2.ScaleCost, C2.SetupCost);
352}
353
354unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
355 bool Vector = (ClassID == 1);
356 if (!Vector)
357 // Discount the stack pointer. Also leave out %r0, since it can't
358 // be used in an address.
359 return 14;
360 if (ST->hasVector())
361 return 32;
362 return 0;
363}
364
367 switch (K) {
369 return TypeSize::getFixed(64);
371 return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
373 return TypeSize::getScalable(0);
374 }
375
376 llvm_unreachable("Unsupported register kind");
377}
378
379unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
380 unsigned NumStridedMemAccesses,
381 unsigned NumPrefetches,
382 bool HasCall) const {
383 // Don't prefetch a loop with many far apart accesses.
384 if (NumPrefetches > 16)
385 return UINT_MAX;
386
387 // Emit prefetch instructions for smaller strides in cases where we think
388 // the hardware prefetcher might not be able to keep up.
389 if (NumStridedMemAccesses > 32 && !HasCall &&
390 (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
391 return 1;
392
393 return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
394}
395
396bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
397 EVT VT = TLI->getValueType(DL, DataType);
398 return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
399}
400
401// Return the bit size for the scalar type or vector element
402// type. getScalarSizeInBits() returns 0 for a pointer type.
403static unsigned getScalarSizeInBits(Type *Ty) {
404 unsigned Size =
405 (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
406 assert(Size > 0 && "Element must have non-zero size.");
407 return Size;
408}
409
410// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
411// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
412// 3.
413static unsigned getNumVectorRegs(Type *Ty) {
414 auto *VTy = cast<FixedVectorType>(Ty);
415 unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
416 assert(WideBits > 0 && "Could not compute size of vector");
417 return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
418}
419
421 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
424 const Instruction *CxtI) {
425
426 // TODO: Handle more cost kinds.
428 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
429 Op2Info, Args, CxtI);
430
431 // TODO: return a good value for BB-VECTORIZER that includes the
432 // immediate loads, which we do not want to count for the loop
433 // vectorizer, since they are hopefully hoisted out of the loop. This
434 // would require a new parameter 'InLoop', but not sure if constant
435 // args are common enough to motivate this.
436
437 unsigned ScalarBits = Ty->getScalarSizeInBits();
438
439 // There are three cases of division and remainder: Dividing with a register
440 // needs a divide instruction. A divisor which is a power of two constant
441 // can be implemented with a sequence of shifts. Any other constant needs a
442 // multiply and shifts.
443 const unsigned DivInstrCost = 20;
444 const unsigned DivMulSeqCost = 10;
445 const unsigned SDivPow2Cost = 4;
446
447 bool SignedDivRem =
448 Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
449 bool UnsignedDivRem =
450 Opcode == Instruction::UDiv || Opcode == Instruction::URem;
451
452 // Check for a constant divisor.
453 bool DivRemConst = false;
454 bool DivRemConstPow2 = false;
455 if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
456 if (const Constant *C = dyn_cast<Constant>(Args[1])) {
457 const ConstantInt *CVal =
458 (C->getType()->isVectorTy()
459 ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
460 : dyn_cast<const ConstantInt>(C));
461 if (CVal && (CVal->getValue().isPowerOf2() ||
462 CVal->getValue().isNegatedPowerOf2()))
463 DivRemConstPow2 = true;
464 else
465 DivRemConst = true;
466 }
467 }
468
469 if (!Ty->isVectorTy()) {
470 // These FP operations are supported with a dedicated instruction for
471 // float, double and fp128 (base implementation assumes float generally
472 // costs 2).
473 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
474 Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
475 return 1;
476
477 // There is no native support for FRem.
478 if (Opcode == Instruction::FRem)
479 return LIBCALL_COST;
480
481 // Give discount for some combined logical operations if supported.
482 if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
483 if (Opcode == Instruction::Xor) {
484 for (const Value *A : Args) {
485 if (const Instruction *I = dyn_cast<Instruction>(A))
486 if (I->hasOneUse() &&
487 (I->getOpcode() == Instruction::And ||
488 I->getOpcode() == Instruction::Or ||
489 I->getOpcode() == Instruction::Xor))
490 return 0;
491 }
492 }
493 else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
494 for (const Value *A : Args) {
495 if (const Instruction *I = dyn_cast<Instruction>(A))
496 if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
497 return 0;
498 }
499 }
500 }
501
502 // Or requires one instruction, although it has custom handling for i64.
503 if (Opcode == Instruction::Or)
504 return 1;
505
506 if (Opcode == Instruction::Xor && ScalarBits == 1) {
507 if (ST->hasLoadStoreOnCond2())
508 return 5; // 2 * (li 0; loc 1); xor
509 return 7; // 2 * ipm sequences ; xor ; shift ; compare
510 }
511
512 if (DivRemConstPow2)
513 return (SignedDivRem ? SDivPow2Cost : 1);
514 if (DivRemConst)
515 return DivMulSeqCost;
516 if (SignedDivRem || UnsignedDivRem)
517 return DivInstrCost;
518 }
519 else if (ST->hasVector()) {
520 auto *VTy = cast<FixedVectorType>(Ty);
521 unsigned VF = VTy->getNumElements();
522 unsigned NumVectors = getNumVectorRegs(Ty);
523
524 // These vector operations are custom handled, but are still supported
525 // with one instruction per vector, regardless of element size.
526 if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
527 Opcode == Instruction::AShr) {
528 return NumVectors;
529 }
530
531 if (DivRemConstPow2)
532 return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
533 if (DivRemConst) {
534 SmallVector<Type *> Tys(Args.size(), Ty);
535 return VF * DivMulSeqCost +
536 getScalarizationOverhead(VTy, Args, Tys, CostKind);
537 }
538 if ((SignedDivRem || UnsignedDivRem) && VF > 4)
539 // Temporary hack: disable high vectorization factors with integer
540 // division/remainder, which will get scalarized and handled with
541 // GR128 registers. The mischeduler is not clever enough to avoid
542 // spilling yet.
543 return 1000;
544
545 // These FP operations are supported with a single vector instruction for
546 // double (base implementation assumes float generally costs 2). For
547 // FP128, the scalar cost is 1, and there is no overhead since the values
548 // are already in scalar registers.
549 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
550 Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
551 switch (ScalarBits) {
552 case 32: {
553 // The vector enhancements facility 1 provides v4f32 instructions.
554 if (ST->hasVectorEnhancements1())
555 return NumVectors;
556 // Return the cost of multiple scalar invocation plus the cost of
557 // inserting and extracting the values.
558 InstructionCost ScalarCost =
560 SmallVector<Type *> Tys(Args.size(), Ty);
562 (VF * ScalarCost) +
563 getScalarizationOverhead(VTy, Args, Tys, CostKind);
564 // FIXME: VF 2 for these FP operations are currently just as
565 // expensive as for VF 4.
566 if (VF == 2)
567 Cost *= 2;
568 return Cost;
569 }
570 case 64:
571 case 128:
572 return NumVectors;
573 default:
574 break;
575 }
576 }
577
578 // There is no native support for FRem.
579 if (Opcode == Instruction::FRem) {
580 SmallVector<Type *> Tys(Args.size(), Ty);
581 InstructionCost Cost = (VF * LIBCALL_COST) +
582 getScalarizationOverhead(VTy, Args, Tys, CostKind);
583 // FIXME: VF 2 for float is currently just as expensive as for VF 4.
584 if (VF == 2 && ScalarBits == 32)
585 Cost *= 2;
586 return Cost;
587 }
588 }
589
590 // Fallback to the default implementation.
591 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
592 Args, CxtI);
593}
594
596 VectorType *Tp,
597 ArrayRef<int> Mask,
599 int Index, VectorType *SubTp,
601 Kind = improveShuffleKindFromMask(Kind, Mask);
602 if (ST->hasVector()) {
603 unsigned NumVectors = getNumVectorRegs(Tp);
604
605 // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
606
607 // FP128 values are always in scalar registers, so there is no work
608 // involved with a shuffle, except for broadcast. In that case register
609 // moves are done with a single instruction per element.
610 if (Tp->getScalarType()->isFP128Ty())
611 return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
612
613 switch (Kind) {
615 // ExtractSubvector Index indicates start offset.
616
617 // Extracting a subvector from first index is a noop.
618 return (Index == 0 ? 0 : NumVectors);
619
621 // Loop vectorizer calls here to figure out the extra cost of
622 // broadcasting a loaded value to all elements of a vector. Since vlrep
623 // loads and replicates with a single instruction, adjust the returned
624 // value.
625 return NumVectors - 1;
626
627 default:
628
629 // SystemZ supports single instruction permutation / replication.
630 return NumVectors;
631 }
632 }
633
634 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
635}
636
637// Return the log2 difference of the element sizes of the two vector types.
638static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
639 unsigned Bits0 = Ty0->getScalarSizeInBits();
640 unsigned Bits1 = Ty1->getScalarSizeInBits();
641
642 if (Bits1 > Bits0)
643 return (Log2_32(Bits1) - Log2_32(Bits0));
644
645 return (Log2_32(Bits0) - Log2_32(Bits1));
646}
647
648// Return the number of instructions needed to truncate SrcTy to DstTy.
650getVectorTruncCost(Type *SrcTy, Type *DstTy) {
651 assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
654 "Packing must reduce size of vector type.");
655 assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
656 cast<FixedVectorType>(DstTy)->getNumElements() &&
657 "Packing should not change number of elements.");
658
659 // TODO: Since fp32 is expanded, the extract cost should always be 0.
660
661 unsigned NumParts = getNumVectorRegs(SrcTy);
662 if (NumParts <= 2)
663 // Up to 2 vector registers can be truncated efficiently with pack or
664 // permute. The latter requires an immediate mask to be loaded, which
665 // typically gets hoisted out of a loop. TODO: return a good value for
666 // BB-VECTORIZER that includes the immediate loads, which we do not want
667 // to count for the loop vectorizer.
668 return 1;
669
670 unsigned Cost = 0;
671 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
672 unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
673 for (unsigned P = 0; P < Log2Diff; ++P) {
674 if (NumParts > 1)
675 NumParts /= 2;
676 Cost += NumParts;
677 }
678
679 // Currently, a general mix of permutes and pack instructions is output by
680 // isel, which follow the cost computation above except for this case which
681 // is one instruction less:
682 if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
683 DstTy->getScalarSizeInBits() == 8)
684 Cost--;
685
686 return Cost;
687}
688
689// Return the cost of converting a vector bitmask produced by a compare
690// (SrcTy), to the type of the select or extend instruction (DstTy).
693 assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
694 "Should only be called with vector types.");
695
696 unsigned PackCost = 0;
697 unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
698 unsigned DstScalarBits = DstTy->getScalarSizeInBits();
699 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
700 if (SrcScalarBits > DstScalarBits)
701 // The bitmask will be truncated.
702 PackCost = getVectorTruncCost(SrcTy, DstTy);
703 else if (SrcScalarBits < DstScalarBits) {
704 unsigned DstNumParts = getNumVectorRegs(DstTy);
705 // Each vector select needs its part of the bitmask unpacked.
706 PackCost = Log2Diff * DstNumParts;
707 // Extra cost for moving part of mask before unpacking.
708 PackCost += DstNumParts - 1;
709 }
710
711 return PackCost;
712}
713
714// Return the type of the compared operands. This is needed to compute the
715// cost for a Select / ZExt or SExt instruction.
716static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
717 Type *OpTy = nullptr;
718 if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
719 OpTy = CI->getOperand(0)->getType();
720 else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
721 if (LogicI->getNumOperands() == 2)
722 if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
723 if (isa<CmpInst>(LogicI->getOperand(1)))
724 OpTy = CI0->getOperand(0)->getType();
725
726 if (OpTy != nullptr) {
727 if (VF == 1) {
728 assert (!OpTy->isVectorTy() && "Expected scalar type");
729 return OpTy;
730 }
731 // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
732 // be either scalar or already vectorized with a same or lesser VF.
733 Type *ElTy = OpTy->getScalarType();
734 return FixedVectorType::get(ElTy, VF);
735 }
736
737 return nullptr;
738}
739
740// Get the cost of converting a boolean vector to a vector with same width
741// and element size as Dst, plus the cost of zero extending if needed.
743getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
744 const Instruction *I) {
745 auto *DstVTy = cast<FixedVectorType>(Dst);
746 unsigned VF = DstVTy->getNumElements();
747 unsigned Cost = 0;
748 // If we know the widths of the compared operands, get any cost of
749 // converting it to match Dst. Otherwise assume same widths.
750 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
751 if (CmpOpTy != nullptr)
752 Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
753 if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
754 // One 'vn' per dst vector with an immediate mask.
755 Cost += getNumVectorRegs(Dst);
756 return Cost;
757}
758
760 Type *Src,
763 const Instruction *I) {
764 // FIXME: Can the logic below also be used for these cost kinds?
766 auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
767 return BaseCost == 0 ? BaseCost : 1;
768 }
769
770 unsigned DstScalarBits = Dst->getScalarSizeInBits();
771 unsigned SrcScalarBits = Src->getScalarSizeInBits();
772
773 if (!Src->isVectorTy()) {
774 assert (!Dst->isVectorTy());
775
776 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
777 if (SrcScalarBits >= 32 ||
778 (I != nullptr && isa<LoadInst>(I->getOperand(0))))
779 return 1;
780 return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
781 }
782
783 if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
784 Src->isIntegerTy(1)) {
785 if (ST->hasLoadStoreOnCond2())
786 return 2; // li 0; loc 1
787
788 // This should be extension of a compare i1 result, which is done with
789 // ipm and a varying sequence of instructions.
790 unsigned Cost = 0;
791 if (Opcode == Instruction::SExt)
792 Cost = (DstScalarBits < 64 ? 3 : 4);
793 if (Opcode == Instruction::ZExt)
794 Cost = 3;
795 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
796 if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
797 // If operands of an fp-type was compared, this costs +1.
798 Cost++;
799 return Cost;
800 }
801 }
802 else if (ST->hasVector()) {
803 // Vector to scalar cast.
804 auto *SrcVecTy = cast<FixedVectorType>(Src);
805 auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
806 if (!DstVecTy) {
807 // TODO: tune vector-to-scalar cast.
808 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
809 }
810 unsigned VF = SrcVecTy->getNumElements();
811 unsigned NumDstVectors = getNumVectorRegs(Dst);
812 unsigned NumSrcVectors = getNumVectorRegs(Src);
813
814 if (Opcode == Instruction::Trunc) {
815 if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
816 return 0; // Check for NOOP conversions.
817 return getVectorTruncCost(Src, Dst);
818 }
819
820 if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
821 if (SrcScalarBits >= 8) {
822 // ZExt will use either a single unpack or a vector permute.
823 if (Opcode == Instruction::ZExt)
824 return NumDstVectors;
825
826 // SExt will be handled with one unpack per doubling of width.
827 unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
828
829 // For types that span multiple vector registers, some additional
830 // instructions are used to setup the unpacking.
831 unsigned NumSrcVectorOps =
832 (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
833 : (NumDstVectors / 2));
834
835 return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
836 }
837 else if (SrcScalarBits == 1)
838 return getBoolVecToIntConversionCost(Opcode, Dst, I);
839 }
840
841 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
842 Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
843 // TODO: Fix base implementation which could simplify things a bit here
844 // (seems to miss on differentiating on scalar/vector types).
845
846 // Only 64 bit vector conversions are natively supported before z15.
847 if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
848 if (SrcScalarBits == DstScalarBits)
849 return NumDstVectors;
850
851 if (SrcScalarBits == 1)
852 return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
853 }
854
855 // Return the cost of multiple scalar invocation plus the cost of
856 // inserting and extracting the values. Base implementation does not
857 // realize float->int gets scalarized.
859 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
860 InstructionCost TotCost = VF * ScalarCost;
861 bool NeedsInserts = true, NeedsExtracts = true;
862 // FP128 registers do not get inserted or extracted.
863 if (DstScalarBits == 128 &&
864 (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
865 NeedsInserts = false;
866 if (SrcScalarBits == 128 &&
867 (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
868 NeedsExtracts = false;
869
870 TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
871 NeedsExtracts, CostKind);
872 TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
873 /*Extract*/ false, CostKind);
874
875 // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
876 if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
877 TotCost *= 2;
878
879 return TotCost;
880 }
881
882 if (Opcode == Instruction::FPTrunc) {
883 if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
884 return VF /*ldxbr/lexbr*/ +
885 getScalarizationOverhead(DstVecTy, /*Insert*/ true,
886 /*Extract*/ false, CostKind);
887 else // double -> float
888 return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
889 }
890
891 if (Opcode == Instruction::FPExt) {
892 if (SrcScalarBits == 32 && DstScalarBits == 64) {
893 // float -> double is very rare and currently unoptimized. Instead of
894 // using vldeb, which can do two at a time, all conversions are
895 // scalarized.
896 return VF * 2;
897 }
898 // -> fp128. VF * lxdb/lxeb + extraction of elements.
899 return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
900 /*Extract*/ true, CostKind);
901 }
902 }
903
904 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
905}
906
907// Scalar i8 / i16 operations will typically be made after first extending
908// the operands to i32.
909static unsigned getOperandsExtensionCost(const Instruction *I) {
910 unsigned ExtCost = 0;
911 for (Value *Op : I->operands())
912 // A load of i8 or i16 sign/zero extends to i32.
913 if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
914 ExtCost++;
915
916 return ExtCost;
917}
918
// Cost of a compare or select; the BaseT fallbacks below identify this as the
// getCmpSelInstrCost implementation.
// NOTE(review): this listing has dropped the line(s) carrying the function
// name, the remaining parameters, the condition guarding the first early
// return and the predicate case labels -- restore them from the original file.
//  * Scalar ICmp: 0 when the compare is expected to become Load and Test,
//    otherwise 1 plus the operand extensions needed for i8 / i16.
//  * Scalar Select: 4 for floating point values, 1 otherwise.
//  * Vector types (only when ST->hasVector()): compares are costed per vector
//    register, selects as one vsel per register plus any mask conversion.
//  * Anything else is deferred to BaseT.
920 Type *CondTy,
921 CmpInst::Predicate VecPred,
923 const Instruction *I) {
 // NOTE(review): the condition guarding this return is missing here.
925 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
926
927 if (!ValTy->isVectorTy()) {
928 switch (Opcode) {
929 case Instruction::ICmp: {
930 // A loaded value compared with 0 with multiple users becomes Load and
931 // Test. The load is then not foldable, so return 0 cost for the ICmp.
932 unsigned ScalarBits = ValTy->getScalarSizeInBits();
933 if (I != nullptr && ScalarBits >= 32)
934 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
935 if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
936 if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
937 C->isZero())
938 return 0;
939
 // Base cost of the compare itself.
940 unsigned Cost = 1;
941 if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
 // Without I, assume both operands need to be extended.
942 Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
943 return Cost;
944 }
945 case Instruction::Select:
946 if (ValTy->isFloatingPointTy())
947 return 4; // No load on condition for FP - costs a conditional jump.
948 return 1; // Load On Condition / Select Register.
949 }
 // Other scalar opcodes leave the switch and reach the BaseT fallback.
950 }
951 else if (ST->hasVector()) {
952 unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();
953
954 // Called with a compare instruction.
955 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
956 unsigned PredicateExtraCost = 0;
957 if (I != nullptr) {
958 // Some predicates cost one or two extra instructions.
959 switch (cast<CmpInst>(I)->getPredicate()) {
 // NOTE(review): the case labels for this group are missing here.
965 PredicateExtraCost = 1;
966 break;
 // NOTE(review): the case labels for this group are missing here.
971 PredicateExtraCost = 2;
972 break;
973 default:
974 break;
975 }
976 }
977
978 // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
979 // floats. FIXME: <2 x float> generates same code as <4 x float>.
980 unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
981 unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
982
 // Both the compare and the predicate fix-up are paid once per register.
983 unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
984 return Cost;
985 }
986 else { // Called with a select instruction.
987 assert (Opcode == Instruction::Select);
988
989 // We can figure out the extra cost of packing / unpacking if the
990 // instruction was passed and the compare instruction is found.
991 unsigned PackCost = 0;
992 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
993 if (CmpOpTy != nullptr)
994 PackCost =
995 getVectorBitmaskConversionCost(CmpOpTy, ValTy);
996
997 return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
998 }
999 }
1000
 // Not handled above: use the generic cost.
1001 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
1002}
1003
// Cost of inserting or extracting a single vector element; the BaseT fallback
// below identifies this as the getVectorInstrCost implementation.
// NOTE(review): the line(s) with the function name and the leading parameters
// are missing from this listing.
//  * InsertElement with 64-bit integer elements: 1 for an even Index, 0 for an
//    odd one (see the vlvgp remark below).
//  * ExtractElement: 1, or 2 for 1-bit elements, plus 1 when Index is 0 and
//    the type is an integer (vector) type.
//  * Anything else is deferred to BaseT.
1006 unsigned Index, Value *Op0,
1007 Value *Op1) {
1008 // vlvgp will insert two grs into a vector register, so only count half the
1009 // number of instructions.
1010 if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
1011 return ((Index % 2 == 0) ? 1 : 0);
1012
1013 if (Opcode == Instruction::ExtractElement) {
1014 int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
1015
1016 // Give a slight penalty for moving out of vector pipeline to FXU unit.
1017 if (Index == 0 && Val->isIntOrIntVectorTy())
1018 Cost += 1;
1019
1020 return Cost;
1021 }
1022
1023 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1024}
1025
1026// Check if a load may be folded as a memory operand in its user.
// Returns false right away unless Ld has exactly one use. Otherwise
// FoldedValue is set to Ld, or to the trunc/sext/zext using Ld when that cast
// itself has a single use, and the opcode of FoldedValue's user together with
// the loaded / truncated / extended bit widths decides the result.
// NOTE(review): the line with the return type is missing from this listing.
1028isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
1029 if (!Ld->hasOneUse())
1030 return false;
1031 FoldedValue = Ld;
1032 const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
1033 unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
 // At most one of the three below becomes non-zero: the result width of a
 // single-use trunc / sext / zext of the load.
1034 unsigned TruncBits = 0;
1035 unsigned SExtBits = 0;
1036 unsigned ZExtBits = 0;
1037 if (UserI->hasOneUse()) {
1038 unsigned UserBits = UserI->getType()->getScalarSizeInBits();
1039 if (isa<TruncInst>(UserI))
1040 TruncBits = UserBits;
1041 else if (isa<SExtInst>(UserI))
1042 SExtBits = UserBits;
1043 else if (isa<ZExtInst>(UserI))
1044 ZExtBits = UserBits;
1045 }
1046 if (TruncBits || SExtBits || ZExtBits) {
 // Look through the cast: from here on UserI is the user of the cast.
1047 FoldedValue = UserI;
1048 UserI = cast<Instruction>(*UserI->user_begin());
1049 // Load (single use) -> trunc/extend (single use) -> UserI
1050 }
1051 if ((UserI->getOpcode() == Instruction::Sub ||
1052 UserI->getOpcode() == Instruction::SDiv ||
1053 UserI->getOpcode() == Instruction::UDiv) &&
1054 UserI->getOperand(1) != FoldedValue)
1055 return false; // Not commutative, only RHS foldable.
1056 // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
1057 // extension was made of the load.
1058 unsigned LoadOrTruncBits =
1059 ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
 // The cases fall through on purpose: each label first performs the checks
 // that apply from that label on and then continues with the checks shared
 // with the labels further down.
1060 switch (UserI->getOpcode()) {
1061 case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
1062 case Instruction::Sub:
1063 case Instruction::ICmp:
1064 if (LoadedBits == 32 && ZExtBits == 64)
1065 return true;
1066 [[fallthrough]];
1067 case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
 // ICmp arrives here via the fallthrough but skips the 16-bit checks.
1068 if (UserI->getOpcode() != Instruction::ICmp) {
1069 if (LoadedBits == 16 &&
1070 (SExtBits == 32 ||
1071 (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
1072 return true;
1073 if (LoadOrTruncBits == 16)
1074 return true;
1075 }
1076 [[fallthrough]];
1077 case Instruction::SDiv:// SE: 32->64
1078 if (LoadedBits == 32 && SExtBits == 64)
1079 return true;
1080 [[fallthrough]];
1081 case Instruction::UDiv:
1082 case Instruction::And:
1083 case Instruction::Or:
1084 case Instruction::Xor:
1085 // This also makes sense for float operations, but disabled for now due
1086 // to regressions.
1087 // case Instruction::FCmp:
1088 // case Instruction::FAdd:
1089 // case Instruction::FSub:
1090 // case Instruction::FMul:
1091 // case Instruction::FDiv:
1092
1093 // All possible extensions of memory checked above.
1094
1095 // Comparison between memory and immediate.
1096 if (UserI->getOpcode() == Instruction::ICmp)
1097 if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
1098 if (CI->getValue().isIntN(16))
1099 return true;
 // Without an extension, only 32 or 64 effectively loaded bits fold.
1100 return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
 // Not reached: the return above is unconditional.
1101 break;
1102 }
 // UserI has none of the opcodes handled by the switch.
1103 return false;
1104}
1105
1106static bool isBswapIntrinsicCall(const Value *V) {
1107 if (const Instruction *I = dyn_cast<Instruction>(V))
1108 if (auto *CI = dyn_cast<CallInst>(I))
1109 if (auto *F = CI->getCalledFunction())
1110 if (F->getIntrinsicID() == Intrinsic::bswap)
1111 return true;
1112 return false;
1113}
1114
// Cost of a load or store of type Src.
// NOTE(review): the line with the function name, the CostKind parameter line
// and the condition guarding the early `return 1` are missing from this
// listing; the visible parameters match getMemoryOpCost.
//  * A scalar load that folds into its single user costs 0, unless the user's
//    other operand is a foldable load as well, in which case only one of the
//    two loads is free.
//  * A load feeding a bswap, or a store of a single-use bswap, costs 0 when
//    the reversed form applies (see below).
//  * Otherwise the cost is the number of vector registers (vector types) or
//    parts (other types), doubled when the scalar size is 128 bits.
1116 MaybeAlign Alignment,
1117 unsigned AddressSpace,
1119 TTI::OperandValueInfo OpInfo,
1120 const Instruction *I) {
1121 assert(!Src->isVoidTy() && "Invalid type");
1122
1123 // TODO: Handle other cost kinds.
 // NOTE(review): the condition guarding this return is missing here;
 // presumably it tests CostKind -- confirm against the original file.
1125 return 1;
1126
1127 if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
1128 // Store the load or its truncated or extended value in FoldedValue.
1129 const Instruction *FoldedValue = nullptr;
1130 if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
1131 const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
1132 assert (UserI->getNumOperands() == 2 && "Expected a binop.");
1133
1134 // UserI can't fold two loads, so in that case return 0 cost only
1135 // half of the time.
1136 for (unsigned i = 0; i < 2; ++i) {
1137 if (UserI->getOperand(i) == FoldedValue)
1138 continue;
1139
1140 if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
1141 LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
1142 if (!OtherLoad &&
1143 (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
1144 isa<ZExtInst>(OtherOp)))
1145 OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
 // NOTE: this call may overwrite FoldedValue (isFoldableLoad assigns its
 // reference parameter), hence the "dummy" remark.
1146 if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
 // i is the position of the other load: cost 1 if it is operand 0,
 // cost 0 if it is operand 1.
1147 return i == 0; // Both operands foldable.
1148 }
1149 }
1150
1151 return 0; // Only I is foldable in user.
1152 }
1153 }
1154
1155 unsigned NumOps =
1156 (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
1157
1158 // Store/Load reversed saves one instruction.
1159 if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
1160 I != nullptr) {
1161 if (Opcode == Instruction::Load && I->hasOneUse()) {
1162 const Instruction *LdUser = cast<Instruction>(*I->user_begin());
1163 // In case of load -> bswap -> store, return normal cost for the load.
 // I.e. the load is free unless the bswap's only use is a store; that
 // store is the one given cost 0 in the branch below.
1164 if (isBswapIntrinsicCall(LdUser) &&
1165 (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
1166 return 0;
1167 }
1168 else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
1169 const Value *StoredVal = SI->getValueOperand();
1170 if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
1171 return 0;
1172 }
1173 }
1174
1175 if (Src->getScalarSizeInBits() == 128)
1176 // 128 bit scalars are held in a pair of two 64 bit registers.
1177 NumOps *= 2;
1178
1179 return NumOps;
1180}
1181
1182// The generic implementation of getInterleavedMemoryOpCost() is based on
1183// adding costs of the memory operations plus all the extracts and inserts
1184// needed for using / defining the vector operands. The SystemZ version does
1185// roughly the same but bases the computations on vector permutations
1186// instead.
// NOTE(review): the line with the return type and function name is missing
// from this listing.
// Masked groups (UseMaskForCond / UseMaskForGaps) are handed to BaseT. For
// the rest the result is the number of vector memory operations plus the
// estimated number of permutes; VecTy must be a fixed vector whose element
// count is a multiple of Factor (asserted below).
1188 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1189 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1190 bool UseMaskForCond, bool UseMaskForGaps) {
1191 if (UseMaskForCond || UseMaskForGaps)
1192 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1193 Alignment, AddressSpace, CostKind,
1194 UseMaskForCond, UseMaskForGaps);
1195 assert(isa<VectorType>(VecTy) &&
1196 "Expect a vector type for interleaved memory op");
1197
1198 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1199 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
 // VF is the number of elements of each member of the interleave group.
1200 unsigned VF = NumElts / Factor;
 // Elements that fit in 128 bits, the register width assumed throughout.
1201 unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
1202 unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
1203 unsigned NumPermutes = 0;
1204
1205 if (Opcode == Instruction::Load) {
1206 // Loading interleave groups may have gaps, which may mean fewer
1207 // loads. Find out how many vectors will be loaded in total, and in how
1208 // many of them each value will be in.
 // UsedInsts marks the vector loads needed by any requested index;
 // ValueVecs[Index] marks the ones holding elements of that index.
1209 BitVector UsedInsts(NumVectorMemOps, false);
1210 std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
1211 for (unsigned Index : Indices)
1212 for (unsigned Elt = 0; Elt < VF; ++Elt) {
 // Position of this element in the wide vector, mapped to its register.
1213 unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
1214 UsedInsts.set(Vec);
1215 ValueVecs[Index].set(Vec);
1216 }
 // Only loads that are actually used are counted.
1217 NumVectorMemOps = UsedInsts.count();
1218
1219 for (unsigned Index : Indices) {
1220 // Estimate that each loaded source vector containing this Index
1221 // requires one operation, except that vperm can handle two input
1222 // registers first time for each dst vector.
1223 unsigned NumSrcVecs = ValueVecs[Index].count();
1224 unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
1225 assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
 // At least one permute is charged per requested index.
1226 NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
1227 }
1228 } else {
1229 // Estimate the permutes for each stored vector as the smaller of the
1230 // number of elements and the number of source vectors. Subtract one per
1231 // dst vector for vperm (S.A.).
1232 unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
1233 unsigned NumDstVecs = NumVectorMemOps;
1234 assert (NumSrcVecs > 1 && "Expected at least two source vectors.");
1235 NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
1236 }
1237
1238 // Cost of load/store operations and the permutations needed.
1239 return NumVectorMemOps + NumPermutes;
1240}
1241
// Target-specific cost of a vector intrinsic: the number of vector registers
// of RetTy for a vector bswap, and -1 for every other case.
// NOTE(review): the signature line is missing from this listing.
1243 if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1244 return getNumVectorRegs(RetTy); // VPERM
1245 return -1;
1246}
1247
// NOTE(review): only a fragment of this function survives in the listing; the
// signature, the statement defining Cost and the final return are missing.
// Presumably Cost is a target-specific intrinsic cost where -1 means that
// none is available -- confirm against the original file.
1253 if (Cost != -1)
1254 return Cost;
1256}
amdgpu Simplify well known AMD library false FunctionCallee Callee
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
const HexagonInstrInfo * TII
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
#define P(N)
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1)
static bool isBswapIntrinsicCall(const Value *V)
static unsigned getOperandsExtensionCost(const Instruction *I)
static Type * getCmpOpsType(const Instruction *I, unsigned VF=1)
static unsigned getScalarSizeInBits(Type *Ty)
static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy)
static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse)
static unsigned getNumVectorRegs(Type *Ty)
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:75
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:441
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:432
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:849
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
Definition: BasicTTIImpl.h:963
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:720
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:610
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:927
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:993
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:155
BitVector & set()
Definition: BitVector.h:344
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1184
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1406
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:708
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:718
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:748
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:742
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:726
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:729
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:727
@ ICMP_NE
not equal
Definition: InstrTypes.h:740
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:746
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:744
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:728
This is the shared class of boolean and integer constants.
Definition: Constants.h:78
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:132
This is an important base class in LLVM.
Definition: Constant.h:41
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:698
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:168
An instruction for reading from memory.
Definition: Instructions.h:177
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:195
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
This class wraps the llvm.memcpy intrinsic.
The optimization diagnostic interface.
The main scalar evolution driver.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
An instruction for storing to memory.
Definition: Instructions.h:301
const SystemZInstrInfo * getInstrInfo() const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
unsigned getNumberOfRegisters(unsigned ClassID) const
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, const Instruction *I)
unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const override
unsigned adjustInliningThreshold(const CallBase *CB) const
unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool hasDivRemOp(Type *DataType, bool IsSigned)
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isLoweredToCall(const Function *F) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:322
static constexpr TypeSize getScalable(ScalarTy MinimunSize)
Definition: TypeSize.h:325
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:258
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:228
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:163
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:255
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:222
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:341
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
iterator_range< user_iterator > users()
Definition: Value.h:421
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:182
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:508
AddressSpace
Definition: NVPTXBaseInfo.h:21
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:373
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:34
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:149
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
unsigned Insns
TODO: Some of these could be merged.
Parameters that control the generic loop unrolling transformation.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...