1//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a TargetTransformInfo analysis pass specific to the
10// SystemZ target machine. It uses the target's detailed information to provide
11// more precise answers to certain TTI queries, while letting the target
12// independent and default TTI implementations handle the rest.
13//
14//===----------------------------------------------------------------------===//
15
16#include "SystemZTargetTransformInfo.h"
17#include "llvm/Analysis/TargetTransformInfo.h"
18#include "llvm/CodeGen/BasicTTIImpl.h"
19#include "llvm/CodeGen/CostTable.h"
20#include "llvm/CodeGen/TargetLowering.h"
21#include "llvm/IR/IntrinsicInst.h"
22#include "llvm/Support/Debug.h"
23using namespace llvm;
24
25#define DEBUG_TYPE "systemztti"
26
27//===----------------------------------------------------------------------===//
28//
29// SystemZ cost model.
30//
31//===----------------------------------------------------------------------===//
32
33static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
34 bool UsedAsMemCpySource = false;
35 for (const User *U : V->users())
36 if (const Instruction *User = dyn_cast<Instruction>(U)) {
37 if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
38 UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
39 continue;
40 }
41 if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
42 if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
43 UsedAsMemCpySource = true;
44 continue;
45 }
46 }
47 OtherUse = true;
48 }
49 return UsedAsMemCpySource;
50}
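// As an illustration (hypothetical IR, names are placeholders): an incoming
// argument that is only ever copied from, e.g.
//
//   %p = getelementptr inbounds [32 x i8], ptr %Arg, i64 0, i64 0
//   call void @llvm.memcpy.p0.p0.i64(ptr %Dst, ptr %p, i64 32, i1 false)
//
// is reported as a memcpy-only source; any other user of %Arg or of the
// intermediate GEP/bitcast sets OtherUse and disables the bonus below.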
51
52unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
53 unsigned Bonus = 0;
54
55 // Increase the threshold if an incoming argument is used only as a memcpy
56 // source.
57 if (Function *Callee = CB->getCalledFunction())
58 for (Argument &Arg : Callee->args()) {
59 bool OtherUse = false;
60 if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse)
61 Bonus += 150;
62 }
63
64 LLVM_DEBUG(if (Bonus)
65 dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
66 return Bonus;
67}
68
69InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
70 TTI::TargetCostKind CostKind) {
71 assert(Ty->isIntegerTy());
72
73 unsigned BitSize = Ty->getPrimitiveSizeInBits();
74 // There is no cost model for constants with a bit size of 0. Return TCC_Free
75 // here, so that constant hoisting will ignore this constant.
76 if (BitSize == 0)
77 return TTI::TCC_Free;
78 // No cost model implemented yet for operations on integers wider than 128 bits.
79 if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
80 return TTI::TCC_Free;
81
82 if (Imm == 0)
83 return TTI::TCC_Free;
84
85 if (Imm.getBitWidth() <= 64) {
86 // Constants loaded via lgfi.
87 if (isInt<32>(Imm.getSExtValue()))
88 return TTI::TCC_Basic;
89 // Constants loaded via llilf.
90 if (isUInt<32>(Imm.getZExtValue()))
91 return TTI::TCC_Basic;
92 // Constants loaded via llihf:
93 if ((Imm.getZExtValue() & 0xffffffff) == 0)
94 return TTI::TCC_Basic;
95
96 return 2 * TTI::TCC_Basic;
97 }
98
99 // i128 immediates are loaded from the constant pool.
100 return 2 * TTI::TCC_Basic;
101}
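// For illustration of the rules above: a value such as 0x7fffffff fits in a
// signed 32-bit immediate (lgfi) and costs 1, (1ULL << 32) has all low bits
// zero (llihf) and costs 1, while an arbitrary 64-bit value such as
// 0x0000000123456789 needs two instructions; any non-zero i128 immediate is
// costed as a constant-pool load (2 * TCC_Basic).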
102
103InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
104 const APInt &Imm, Type *Ty,
105 TTI::TargetCostKind CostKind,
106 Instruction *Inst) {
107 assert(Ty->isIntegerTy());
108
109 unsigned BitSize = Ty->getPrimitiveSizeInBits();
110 // There is no cost model for constants with a bit size of 0. Return TCC_Free
111 // here, so that constant hoisting will ignore this constant.
112 if (BitSize == 0)
113 return TTI::TCC_Free;
114 // No cost model implemented yet for operations on integers wider than 64 bits.
115 if (BitSize > 64)
116 return TTI::TCC_Free;
117
118 switch (Opcode) {
119 default:
120 return TTI::TCC_Free;
121 case Instruction::GetElementPtr:
122 // Always hoist the base address of a GetElementPtr. This prevents the
123 // creation of new constants for every base constant that gets constant
124 // folded with the offset.
125 if (Idx == 0)
126 return 2 * TTI::TCC_Basic;
127 return TTI::TCC_Free;
128 case Instruction::Store:
129 if (Idx == 0 && Imm.getBitWidth() <= 64) {
130 // Any 8-bit immediate store can be implemented via mvi.
131 if (BitSize == 8)
132 return TTI::TCC_Free;
133 // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
134 if (isInt<16>(Imm.getSExtValue()))
135 return TTI::TCC_Free;
136 }
137 break;
138 case Instruction::ICmp:
139 if (Idx == 1 && Imm.getBitWidth() <= 64) {
140 // Comparisons against signed 32-bit immediates implemented via cgfi.
141 if (isInt<32>(Imm.getSExtValue()))
142 return TTI::TCC_Free;
143 // Comparisons against unsigned 32-bit immediates implemented via clgfi.
144 if (isUInt<32>(Imm.getZExtValue()))
145 return TTI::TCC_Free;
146 }
147 break;
148 case Instruction::Add:
149 case Instruction::Sub:
150 if (Idx == 1 && Imm.getBitWidth() <= 64) {
151 // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
152 if (isUInt<32>(Imm.getZExtValue()))
153 return TTI::TCC_Free;
154 // Or their negation, by swapping addition vs. subtraction.
155 if (isUInt<32>(-Imm.getSExtValue()))
156 return TTI::TCC_Free;
157 }
158 break;
159 case Instruction::Mul:
160 if (Idx == 1 && Imm.getBitWidth() <= 64) {
161 // We use msgfi to multiply by 32-bit signed immediates.
162 if (isInt<32>(Imm.getSExtValue()))
163 return TTI::TCC_Free;
164 }
165 break;
166 case Instruction::Or:
167 case Instruction::Xor:
168 if (Idx == 1 && Imm.getBitWidth() <= 64) {
169 // Masks supported by oilf/xilf.
170 if (isUInt<32>(Imm.getZExtValue()))
171 return TTI::TCC_Free;
172 // Masks supported by oihf/xihf.
173 if ((Imm.getZExtValue() & 0xffffffff) == 0)
174 return TTI::TCC_Free;
175 }
176 break;
177 case Instruction::And:
178 if (Idx == 1 && Imm.getBitWidth() <= 64) {
179 // Any 32-bit AND operation can be implemented via nilf.
180 if (BitSize <= 32)
181 return TTI::TCC_Free;
182 // 64-bit masks supported by nilf.
183 if (isUInt<32>(~Imm.getZExtValue()))
184 return TTI::TCC_Free;
185 // 64-bit masks supported by nilh.
186 if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
187 return TTI::TCC_Free;
188 // Some 64-bit AND operations can be implemented via risbg.
189 const SystemZInstrInfo *TII = ST->getInstrInfo();
190 unsigned Start, End;
191 if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
192 return TTI::TCC_Free;
193 }
194 break;
195 case Instruction::Shl:
196 case Instruction::LShr:
197 case Instruction::AShr:
198 // Always return TCC_Free for the shift value of a shift instruction.
199 if (Idx == 1)
200 return TTI::TCC_Free;
201 break;
202 case Instruction::UDiv:
203 case Instruction::SDiv:
204 case Instruction::URem:
205 case Instruction::SRem:
206 case Instruction::Trunc:
207 case Instruction::ZExt:
208 case Instruction::SExt:
209 case Instruction::IntToPtr:
210 case Instruction::PtrToInt:
211 case Instruction::BitCast:
212 case Instruction::PHI:
213 case Instruction::Call:
214 case Instruction::Select:
215 case Instruction::Ret:
216 case Instruction::Load:
217 break;
218 }
219
220 return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
221}
222
223InstructionCost
224SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
225 const APInt &Imm, Type *Ty,
226 TTI::TargetCostKind CostKind) {
227 assert(Ty->isIntegerTy());
228
229 unsigned BitSize = Ty->getPrimitiveSizeInBits();
230 // There is no cost model for constants with a bit size of 0. Return TCC_Free
231 // here, so that constant hoisting will ignore this constant.
232 if (BitSize == 0)
233 return TTI::TCC_Free;
234 // No cost model implemented yet for operations on integers wider than 64 bits.
235 if (BitSize > 64)
236 return TTI::TCC_Free;
237
238 switch (IID) {
239 default:
240 return TTI::TCC_Free;
241 case Intrinsic::sadd_with_overflow:
242 case Intrinsic::uadd_with_overflow:
243 case Intrinsic::ssub_with_overflow:
244 case Intrinsic::usub_with_overflow:
245 // These get expanded to include a normal addition/subtraction.
246 if (Idx == 1 && Imm.getBitWidth() <= 64) {
247 if (isUInt<32>(Imm.getZExtValue()))
248 return TTI::TCC_Free;
249 if (isUInt<32>(-Imm.getSExtValue()))
250 return TTI::TCC_Free;
251 }
252 break;
253 case Intrinsic::smul_with_overflow:
254 case Intrinsic::umul_with_overflow:
255 // These get expanded to include a normal multiplication.
256 if (Idx == 1 && Imm.getBitWidth() <= 64) {
257 if (isInt<32>(Imm.getSExtValue()))
258 return TTI::TCC_Free;
259 }
260 break;
261 case Intrinsic::experimental_stackmap:
262 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
263 return TTI::TCC_Free;
264 break;
265 case Intrinsic::experimental_patchpoint_void:
266 case Intrinsic::experimental_patchpoint_i64:
267 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
268 return TTI::TCC_Free;
269 break;
270 }
271 return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
272}
273
274TargetTransformInfo::PopcntSupportKind
275SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
276 assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
277 if (ST->hasPopulationCount() && TyWidth <= 64)
278 return TTI::PSK_FastHardware;
279 return TTI::PSK_Software;
280}
281
282void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
283 TTI::UnrollingPreferences &UP,
284 OptimizationRemarkEmitter *ORE) {
285 // Find out if L contains a call, what the machine instruction count
286 // estimate is, and how many stores there are.
287 bool HasCall = false;
288 InstructionCost NumStores = 0;
289 for (auto &BB : L->blocks())
290 for (auto &I : *BB) {
291 if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
292 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
293 if (isLoweredToCall(F))
294 HasCall = true;
295 if (F->getIntrinsicID() == Intrinsic::memcpy ||
296 F->getIntrinsicID() == Intrinsic::memset)
297 NumStores++;
298 } else { // indirect call.
299 HasCall = true;
300 }
301 }
302 if (isa<StoreInst>(&I)) {
303 Type *MemAccessTy = I.getOperand(0)->getType();
304 NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy,
305 std::nullopt, 0, TTI::TCK_RecipThroughput);
306 }
307 }
308
309 // The z13 processor will run out of store tags if too many stores
310 // are fed into it too quickly. Therefore make sure there are not
311 // too many stores in the resulting unrolled loop.
312 unsigned const NumStoresVal = *NumStores.getValue();
313 unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);
314
315 if (HasCall) {
316 // Only allow full unrolling if loop has any calls.
317 UP.FullUnrollMaxCount = Max;
318 UP.MaxCount = 1;
319 return;
320 }
321
322 UP.MaxCount = Max;
323 if (UP.MaxCount <= 1)
324 return;
325
326 // Allow partial and runtime trip count unrolling.
327 UP.Partial = UP.Runtime = true;
328
329 UP.PartialThreshold = 75;
330 UP.DefaultUnrollRuntimeCount = 4;
331
332 // Allow expensive instructions in the pre-header of the loop.
333 UP.AllowExpensiveTripCount = true;
334
335 UP.Force = true;
336}
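// As a worked example of the store limit above: a call-free loop body whose
// stores cost 3 gets Max = 12 / 3 = 4 and may be partially or runtime
// unrolled at most 4 times, while a loop that contains a call keeps
// MaxCount = 1 and is only eligible for full unrolling up to that limit.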
337
338void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
339 TTI::PeelingPreferences &PP) {
340 BaseT::getPeelingPreferences(L, SE, PP);
341}
342
343bool SystemZTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
344 const TargetTransformInfo::LSRCost &C2) {
345 // SystemZ specific: check instruction count (first), and don't care about
346 // ImmCost, since offsets are checked explicitly.
347 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
348 C1.NumIVMuls, C1.NumBaseAdds,
349 C1.ScaleCost, C1.SetupCost) <
350 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
351 C2.NumIVMuls, C2.NumBaseAdds,
352 C2.ScaleCost, C2.SetupCost);
353}
354
355unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
356 bool Vector = (ClassID == 1);
357 if (!Vector)
358 // Discount the stack pointer. Also leave out %r0, since it can't
359 // be used in an address.
360 return 14;
361 if (ST->hasVector())
362 return 32;
363 return 0;
364}
365
366TypeSize
367SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
368 switch (K) {
369 case TargetTransformInfo::RGK_Scalar:
370 return TypeSize::getFixed(64);
371 case TargetTransformInfo::RGK_FixedWidthVector:
372 return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
373 case TargetTransformInfo::RGK_ScalableVector:
374 return TypeSize::getScalable(0);
375 }
376
377 llvm_unreachable("Unsupported register kind");
378}
379
380unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
381 unsigned NumStridedMemAccesses,
382 unsigned NumPrefetches,
383 bool HasCall) const {
384 // Don't prefetch a loop with many far apart accesses.
385 if (NumPrefetches > 16)
386 return UINT_MAX;
387
388 // Emit prefetch instructions for smaller strides in cases where we think
389 // the hardware prefetcher might not be able to keep up.
390 if (NumStridedMemAccesses > 32 && !HasCall &&
391 (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
392 return 1;
393
394 return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
395}
396
397bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
398 EVT VT = TLI->getValueType(DL, DataType);
399 return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
400}
401
402// Return the bit size for the scalar type or vector element
403// type. getScalarSizeInBits() returns 0 for a pointer type.
404static unsigned getScalarSizeInBits(Type *Ty) {
405 unsigned Size =
406 (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
407 assert(Size > 0 && "Element must have non-zero size.");
408 return Size;
409}
410
411// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
412// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
413// 3.
414static unsigned getNumVectorRegs(Type *Ty) {
415 auto *VTy = cast<FixedVectorType>(Ty);
416 unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
417 assert(WideBits > 0 && "Could not compute size of vector");
418 return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
419}
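// E.g. <6 x i64> is 384 bits wide and is counted here as 3 vector registers,
// whereas type legalization would widen it to 4 parts.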
420
421InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
422 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
423 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
424 ArrayRef<const Value *> Args,
425 const Instruction *CxtI) {
426
427 // TODO: Handle more cost kinds.
428 if (CostKind != TTI::TCK_RecipThroughput)
429 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
430 Op2Info, Args, CxtI);
431
432 // TODO: return a good value for BB-VECTORIZER that includes the
433 // immediate loads, which we do not want to count for the loop
434 // vectorizer, since they are hopefully hoisted out of the loop. This
435 // would require a new parameter 'InLoop', but not sure if constant
436 // args are common enough to motivate this.
437
438 unsigned ScalarBits = Ty->getScalarSizeInBits();
439
440 // There are three cases of division and remainder: Dividing with a register
441 // needs a divide instruction. A divisor which is a power of two constant
442 // can be implemented with a sequence of shifts. Any other constant needs a
443 // multiply and shifts.
444 const unsigned DivInstrCost = 20;
445 const unsigned DivMulSeqCost = 10;
446 const unsigned SDivPow2Cost = 4;
447
448 bool SignedDivRem =
449 Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
450 bool UnsignedDivRem =
451 Opcode == Instruction::UDiv || Opcode == Instruction::URem;
452
453 // Check for a constant divisor.
454 bool DivRemConst = false;
455 bool DivRemConstPow2 = false;
456 if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
457 if (const Constant *C = dyn_cast<Constant>(Args[1])) {
458 const ConstantInt *CVal =
459 (C->getType()->isVectorTy()
460 ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
461 : dyn_cast<const ConstantInt>(C));
462 if (CVal && (CVal->getValue().isPowerOf2() ||
463 CVal->getValue().isNegatedPowerOf2()))
464 DivRemConstPow2 = true;
465 else
466 DivRemConst = true;
467 }
468 }
469
470 if (!Ty->isVectorTy()) {
471 // These FP operations are supported with a dedicated instruction for
472 // float, double and fp128 (base implementation assumes float generally
473 // costs 2).
474 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
475 Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
476 return 1;
477
478 // There is no native support for FRem.
479 if (Opcode == Instruction::FRem)
480 return LIBCALL_COST;
481
482 // Give discount for some combined logical operations if supported.
483 if (Args.size() == 2) {
484 if (Opcode == Instruction::Xor) {
485 for (const Value *A : Args) {
486 if (const Instruction *I = dyn_cast<Instruction>(A))
487 if (I->hasOneUse() &&
488 (I->getOpcode() == Instruction::Or ||
489 I->getOpcode() == Instruction::And ||
490 I->getOpcode() == Instruction::Xor))
491 if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
492 (isInt128InVR(Ty) &&
493 (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1())))
494 return 0;
495 }
496 }
497 else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
498 for (const Value *A : Args) {
499 if (const Instruction *I = dyn_cast<Instruction>(A))
500 if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
501 ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
502 (isInt128InVR(Ty) &&
503 (Opcode == Instruction::And || ST->hasVectorEnhancements1()))))
504 return 0;
505 }
506 }
507 }
508
509 // Or requires one instruction, although it has custom handling for i64.
510 if (Opcode == Instruction::Or)
511 return 1;
512
513 if (Opcode == Instruction::Xor && ScalarBits == 1) {
514 if (ST->hasLoadStoreOnCond2())
515 return 5; // 2 * (li 0; loc 1); xor
516 return 7; // 2 * ipm sequences ; xor ; shift ; compare
517 }
518
519 if (DivRemConstPow2)
520 return (SignedDivRem ? SDivPow2Cost : 1);
521 if (DivRemConst)
522 return DivMulSeqCost;
523 if (SignedDivRem || UnsignedDivRem)
524 return DivInstrCost;
525 }
526 else if (ST->hasVector()) {
527 auto *VTy = cast<FixedVectorType>(Ty);
528 unsigned VF = VTy->getNumElements();
529 unsigned NumVectors = getNumVectorRegs(Ty);
530
531 // These vector operations are custom handled, but are still supported
532 // with one instruction per vector, regardless of element size.
533 if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
534 Opcode == Instruction::AShr) {
535 return NumVectors;
536 }
537
538 if (DivRemConstPow2)
539 return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
540 if (DivRemConst) {
541 SmallVector<Type *> Tys(Args.size(), Ty);
542 return VF * DivMulSeqCost +
543 getScalarizationOverhead(VTy, Args, Tys, CostKind);
544 }
545 if ((SignedDivRem || UnsignedDivRem) && VF > 4)
546 // Temporary hack: disable high vectorization factors with integer
547 // division/remainder, which will get scalarized and handled with
548 // GR128 registers. The mischeduler is not clever enough to avoid
549 // spilling yet.
550 return 1000;
551
552 // These FP operations are supported with a single vector instruction for
553 // double (base implementation assumes float generally costs 2). For
554 // FP128, the scalar cost is 1, and there is no overhead since the values
555 // are already in scalar registers.
556 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
557 Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
558 switch (ScalarBits) {
559 case 32: {
560 // The vector enhancements facility 1 provides v4f32 instructions.
561 if (ST->hasVectorEnhancements1())
562 return NumVectors;
563 // Return the cost of multiple scalar invocations plus the cost of
564 // inserting and extracting the values.
565 InstructionCost ScalarCost =
566 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
567 SmallVector<Type *> Tys(Args.size(), Ty);
568 InstructionCost Cost =
569 (VF * ScalarCost) +
570 getScalarizationOverhead(VTy, Args, Tys, CostKind);
571 // FIXME: VF 2 for these FP operations is currently just as
572 // expensive as for VF 4.
573 if (VF == 2)
574 Cost *= 2;
575 return Cost;
576 }
577 case 64:
578 case 128:
579 return NumVectors;
580 default:
581 break;
582 }
583 }
584
585 // There is no native support for FRem.
586 if (Opcode == Instruction::FRem) {
587 SmallVector<Type *> Tys(Args.size(), Ty);
588 InstructionCost Cost = (VF * LIBCALL_COST) +
589 getScalarizationOverhead(VTy, Args, Tys, CostKind);
590 // FIXME: VF 2 for float is currently just as expensive as for VF 4.
591 if (VF == 2 && ScalarBits == 32)
592 Cost *= 2;
593 return Cost;
594 }
595 }
596
597 // Fallback to the default implementation.
598 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
599 Args, CxtI);
600}
601
602InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
603 VectorType *Tp,
604 ArrayRef<int> Mask,
605 TTI::TargetCostKind CostKind,
606 int Index, VectorType *SubTp,
607 ArrayRef<const Value *> Args) {
608 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
609 if (ST->hasVector()) {
610 unsigned NumVectors = getNumVectorRegs(Tp);
611
612 // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
613
614 // FP128 values are always in scalar registers, so there is no work
615 // involved with a shuffle, except for broadcast. In that case register
616 // moves are done with a single instruction per element.
617 if (Tp->getScalarType()->isFP128Ty())
618 return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
619
620 switch (Kind) {
621 case TargetTransformInfo::SK_ExtractSubvector:
622 // ExtractSubvector Index indicates start offset.
623
624 // Extracting a subvector from first index is a noop.
625 return (Index == 0 ? 0 : NumVectors);
626
627 case TargetTransformInfo::SK_Broadcast:
628 // Loop vectorizer calls here to figure out the extra cost of
629 // broadcasting a loaded value to all elements of a vector. Since vlrep
630 // loads and replicates with a single instruction, adjust the returned
631 // value.
632 return NumVectors - 1;
633
634 default:
635
636 // SystemZ supports single instruction permutation / replication.
637 return NumVectors;
638 }
639 }
640
641 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
642}
643
644// Return the log2 difference of the element sizes of the two vector types.
645static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
646 unsigned Bits0 = Ty0->getScalarSizeInBits();
647 unsigned Bits1 = Ty1->getScalarSizeInBits();
648
649 if (Bits1 > Bits0)
650 return (Log2_32(Bits1) - Log2_32(Bits0));
651
652 return (Log2_32(Bits0) - Log2_32(Bits1));
653}
654
655// Return the number of instructions needed to truncate SrcTy to DstTy.
656unsigned SystemZTTIImpl::
657getVectorTruncCost(Type *SrcTy, Type *DstTy) {
658 assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
659 assert(SrcTy->getPrimitiveSizeInBits().getFixedValue() >
660 DstTy->getPrimitiveSizeInBits().getFixedValue() &&
661 "Packing must reduce size of vector type.");
662 assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
663 cast<FixedVectorType>(DstTy)->getNumElements() &&
664 "Packing should not change number of elements.");
665
666 // TODO: Since fp32 is expanded, the extract cost should always be 0.
667
668 unsigned NumParts = getNumVectorRegs(SrcTy);
669 if (NumParts <= 2)
670 // Up to 2 vector registers can be truncated efficiently with pack or
671 // permute. The latter requires an immediate mask to be loaded, which
672 // typically gets hoisted out of a loop. TODO: return a good value for
673 // BB-VECTORIZER that includes the immediate loads, which we do not want
674 // to count for the loop vectorizer.
675 return 1;
676
677 unsigned Cost = 0;
678 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
679 unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
680 for (unsigned P = 0; P < Log2Diff; ++P) {
681 if (NumParts > 1)
682 NumParts /= 2;
683 Cost += NumParts;
684 }
685
686 // Currently, a general mix of permutes and pack instructions is output by
687 // isel, which follow the cost computation above except for this case which
688 // is one instruction less:
689 if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
690 DstTy->getScalarSizeInBits() == 8)
691 Cost--;
692
693 return Cost;
694}
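// Worked example: truncating <8 x i64> to <8 x i8> starts from 4 source
// registers with Log2Diff == 3, so the loop above adds 2 + 1 + 1 = 4 packs,
// and the special case then subtracts one for a final cost of 3.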
695
696// Return the cost of converting a vector bitmask produced by a compare
697// (SrcTy), to the type of the select or extend instruction (DstTy).
698unsigned SystemZTTIImpl::
699getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
700 assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
701 "Should only be called with vector types.");
702
703 unsigned PackCost = 0;
704 unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
705 unsigned DstScalarBits = DstTy->getScalarSizeInBits();
706 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
707 if (SrcScalarBits > DstScalarBits)
708 // The bitmask will be truncated.
709 PackCost = getVectorTruncCost(SrcTy, DstTy);
710 else if (SrcScalarBits < DstScalarBits) {
711 unsigned DstNumParts = getNumVectorRegs(DstTy);
712 // Each vector select needs its part of the bitmask unpacked.
713 PackCost = Log2Diff * DstNumParts;
714 // Extra cost for moving part of mask before unpacking.
715 PackCost += DstNumParts - 1;
716 }
717
718 return PackCost;
719}
720
721// Return the type of the compared operands. This is needed to compute the
722// cost for a Select / ZExt or SExt instruction.
723static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
724 Type *OpTy = nullptr;
725 if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
726 OpTy = CI->getOperand(0)->getType();
727 else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
728 if (LogicI->getNumOperands() == 2)
729 if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
730 if (isa<CmpInst>(LogicI->getOperand(1)))
731 OpTy = CI0->getOperand(0)->getType();
732
733 if (OpTy != nullptr) {
734 if (VF == 1) {
735 assert (!OpTy->isVectorTy() && "Expected scalar type");
736 return OpTy;
737 }
738 // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
739 // be either scalar or already vectorized with the same or a lesser VF.
740 Type *ElTy = OpTy->getScalarType();
741 return FixedVectorType::get(ElTy, VF);
742 }
743
744 return nullptr;
745}
746
747// Get the cost of converting a boolean vector to a vector with same width
748// and element size as Dst, plus the cost of zero extending if needed.
749unsigned SystemZTTIImpl::
750getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
751 const Instruction *I) {
752 auto *DstVTy = cast<FixedVectorType>(Dst);
753 unsigned VF = DstVTy->getNumElements();
754 unsigned Cost = 0;
755 // If we know the widths of the compared operands, get any cost of
756 // converting them to match Dst. Otherwise assume the same widths.
757 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
758 if (CmpOpTy != nullptr)
759 Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
760 if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
761 // One 'vn' per dst vector with an immediate mask.
762 Cost += getNumVectorRegs(Dst);
763 return Cost;
764}
765
766InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
767 Type *Src,
768 TTI::CastContextHint CCH,
769 TTI::TargetCostKind CostKind,
770 const Instruction *I) {
771 // FIXME: Can the logic below also be used for these cost kinds?
772 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
773 auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
774 return BaseCost == 0 ? BaseCost : 1;
775 }
776
777 unsigned DstScalarBits = Dst->getScalarSizeInBits();
778 unsigned SrcScalarBits = Src->getScalarSizeInBits();
779
780 if (!Src->isVectorTy()) {
781 assert (!Dst->isVectorTy());
782
783 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
784 if (Src->isIntegerTy(128))
785 return LIBCALL_COST;
786 if (SrcScalarBits >= 32 ||
787 (I != nullptr && isa<LoadInst>(I->getOperand(0))))
788 return 1;
789 return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
790 }
791
792 if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
793 Dst->isIntegerTy(128))
794 return LIBCALL_COST;
795
796 if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
797 if (Src->isIntegerTy(1)) {
798 if (DstScalarBits == 128)
799 return 5 /*branch seq.*/;
800
801 if (ST->hasLoadStoreOnCond2())
802 return 2; // li 0; loc 1
803
804 // This should be extension of a compare i1 result, which is done with
805 // ipm and a varying sequence of instructions.
806 unsigned Cost = 0;
807 if (Opcode == Instruction::SExt)
808 Cost = (DstScalarBits < 64 ? 3 : 4);
809 if (Opcode == Instruction::ZExt)
810 Cost = 3;
811 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
812 if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
813 // If operands of an fp-type were compared, this costs +1.
814 Cost++;
815 return Cost;
816 }
817 else if (isInt128InVR(Dst)) {
818 // Extensions from GPR to i128 (in VR) typically costs two instructions,
819 // but a zero-extending load would be just one extra instruction.
820 if (Opcode == Instruction::ZExt && I != nullptr)
821 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
822 if (Ld->hasOneUse())
823 return 1;
824 return 2;
825 }
826 }
827
828 if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
829 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
830 if (Ld->hasOneUse())
831 return 0; // Will be converted to GPR load.
832 bool OnlyTruncatingStores = true;
833 for (const User *U : I->users())
834 if (!isa<StoreInst>(U)) {
835 OnlyTruncatingStores = false;
836 break;
837 }
838 if (OnlyTruncatingStores)
839 return 0;
840 return 2; // Vector element extraction.
841 }
842 }
843 else if (ST->hasVector()) {
844 // Vector to scalar cast.
845 auto *SrcVecTy = cast<FixedVectorType>(Src);
846 auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
847 if (!DstVecTy) {
848 // TODO: tune vector-to-scalar cast.
849 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
850 }
851 unsigned VF = SrcVecTy->getNumElements();
852 unsigned NumDstVectors = getNumVectorRegs(Dst);
853 unsigned NumSrcVectors = getNumVectorRegs(Src);
854
855 if (Opcode == Instruction::Trunc) {
856 if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
857 return 0; // Check for NOOP conversions.
858 return getVectorTruncCost(Src, Dst);
859 }
860
861 if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
862 if (SrcScalarBits >= 8) {
863 // ZExt will use either a single unpack or a vector permute.
864 if (Opcode == Instruction::ZExt)
865 return NumDstVectors;
866
867 // SExt will be handled with one unpack per doubling of width.
868 unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
869
870 // For types that span multiple vector registers, some additional
871 // instructions are used to setup the unpacking.
872 unsigned NumSrcVectorOps =
873 (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
874 : (NumDstVectors / 2));
875
876 return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
877 }
878 else if (SrcScalarBits == 1)
879 return getBoolVecToIntConversionCost(Opcode, Dst, I);
880 }
881
882 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
883 Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
884 // TODO: Fix base implementation which could simplify things a bit here
885 // (seems to miss on differentiating on scalar/vector types).
886
887 // Only 64 bit vector conversions are natively supported before z15.
888 if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
889 if (SrcScalarBits == DstScalarBits)
890 return NumDstVectors;
891
892 if (SrcScalarBits == 1)
893 return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
894 }
895
896 // Return the cost of multiple scalar invocations plus the cost of
897 // inserting and extracting the values. Base implementation does not
898 // realize float->int gets scalarized.
899 InstructionCost ScalarCost = getCastInstrCost(
900 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
901 InstructionCost TotCost = VF * ScalarCost;
902 bool NeedsInserts = true, NeedsExtracts = true;
903 // FP128 registers do not get inserted or extracted.
904 if (DstScalarBits == 128 &&
905 (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
906 NeedsInserts = false;
907 if (SrcScalarBits == 128 &&
908 (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
909 NeedsExtracts = false;
910
911 TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
912 NeedsExtracts, CostKind);
913 TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
914 /*Extract*/ false, CostKind);
915
916 // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
917 if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
918 TotCost *= 2;
919
920 return TotCost;
921 }
922
923 if (Opcode == Instruction::FPTrunc) {
924 if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
925 return VF /*ldxbr/lexbr*/ +
926 getScalarizationOverhead(DstVecTy, /*Insert*/ true,
927 /*Extract*/ false, CostKind);
928 else // double -> float
929 return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
930 }
931
932 if (Opcode == Instruction::FPExt) {
933 if (SrcScalarBits == 32 && DstScalarBits == 64) {
934 // float -> double is very rare and currently unoptimized. Instead of
935 // using vldeb, which can do two at a time, all conversions are
936 // scalarized.
937 return VF * 2;
938 }
939 // -> fp128. VF * lxdb/lxeb + extraction of elements.
940 return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
941 /*Extract*/ true, CostKind);
942 }
943 }
944
945 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
946}
947
948// Scalar i8 / i16 operations will typically be made after first extending
949// the operands to i32.
950static unsigned getOperandsExtensionCost(const Instruction *I) {
951 unsigned ExtCost = 0;
952 for (Value *Op : I->operands())
953 // A load of i8 or i16 sign/zero extends to i32.
954 if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
955 ExtCost++;
956
957 return ExtCost;
958}
959
960InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
961 Type *CondTy,
962 CmpInst::Predicate VecPred,
963 TTI::TargetCostKind CostKind,
964 const Instruction *I) {
965 if (CostKind != TTI::TCK_RecipThroughput)
966 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
967
968 if (!ValTy->isVectorTy()) {
969 switch (Opcode) {
970 case Instruction::ICmp: {
971 // A loaded value compared with 0 with multiple users becomes Load and
972 // Test. The load is then not foldable, so return 0 cost for the ICmp.
973 unsigned ScalarBits = ValTy->getScalarSizeInBits();
974 if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
975 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
976 if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
977 if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
978 C->isZero())
979 return 0;
980
981 unsigned Cost = 1;
982 if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
983 Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
984 return Cost;
985 }
986 case Instruction::Select:
987 if (ValTy->isFloatingPointTy() || isInt128InVR(ValTy))
988 return 4; // No LOC for FP / i128 - costs a conditional jump.
989 return 1; // Load On Condition / Select Register.
990 }
991 }
992 else if (ST->hasVector()) {
993 unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();
994
995 // Called with a compare instruction.
996 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
997 unsigned PredicateExtraCost = 0;
998 if (I != nullptr) {
999 // Some predicates cost one or two extra instructions.
1000 switch (cast<CmpInst>(I)->getPredicate()) {
1001 case CmpInst::Predicate::ICMP_NE:
1002 case CmpInst::Predicate::ICMP_UGE:
1003 case CmpInst::Predicate::ICMP_ULE:
1004 case CmpInst::Predicate::ICMP_SGE:
1005 case CmpInst::Predicate::ICMP_SLE:
1006 PredicateExtraCost = 1;
1007 break;
1008 case CmpInst::Predicate::FCMP_ONE:
1009 case CmpInst::Predicate::FCMP_ORD:
1010 case CmpInst::Predicate::FCMP_UEQ:
1011 case CmpInst::Predicate::FCMP_UNO:
1012 PredicateExtraCost = 2;
1013 break;
1014 default:
1015 break;
1016 }
1017 }
1018
1019 // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
1020 // floats. FIXME: <2 x float> generates same code as <4 x float>.
1021 unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
1022 unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
1023
1024 unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
1025 return Cost;
1026 }
1027 else { // Called with a select instruction.
1028 assert (Opcode == Instruction::Select);
1029
1030 // We can figure out the extra cost of packing / unpacking if the
1031 // instruction was passed and the compare instruction is found.
1032 unsigned PackCost = 0;
1033 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
1034 if (CmpOpTy != nullptr)
1035 PackCost =
1036 getVectorBitmaskConversionCost(CmpOpTy, ValTy);
1037
1038 return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
1039 }
1040 }
1041
1042 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
1043}
1044
1045InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1046 TTI::TargetCostKind CostKind,
1047 unsigned Index, Value *Op0,
1048 Value *Op1) {
1049 // vlvgp will insert two grs into a vector register, so only count half the
1050 // number of instructions.
1051 if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
1052 return ((Index % 2 == 0) ? 1 : 0);
1053
1054 if (Opcode == Instruction::ExtractElement) {
1055 int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
1056
1057 // Give a slight penalty for moving out of vector pipeline to FXU unit.
1058 if (Index == 0 && Val->isIntOrIntVectorTy())
1059 Cost += 1;
1060
1061 return Cost;
1062 }
1063
1064 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1065}
1066
1067// Check if a load may be folded as a memory operand in its user.
1068bool SystemZTTIImpl::
1069isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
1070 if (!Ld->hasOneUse())
1071 return false;
1072 FoldedValue = Ld;
1073 const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
1074 unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
1075 unsigned TruncBits = 0;
1076 unsigned SExtBits = 0;
1077 unsigned ZExtBits = 0;
1078 if (UserI->hasOneUse()) {
1079 unsigned UserBits = UserI->getType()->getScalarSizeInBits();
1080 if (isa<TruncInst>(UserI))
1081 TruncBits = UserBits;
1082 else if (isa<SExtInst>(UserI))
1083 SExtBits = UserBits;
1084 else if (isa<ZExtInst>(UserI))
1085 ZExtBits = UserBits;
1086 }
1087 if (TruncBits || SExtBits || ZExtBits) {
1088 FoldedValue = UserI;
1089 UserI = cast<Instruction>(*UserI->user_begin());
1090 // Load (single use) -> trunc/extend (single use) -> UserI
1091 }
1092 if ((UserI->getOpcode() == Instruction::Sub ||
1093 UserI->getOpcode() == Instruction::SDiv ||
1094 UserI->getOpcode() == Instruction::UDiv) &&
1095 UserI->getOperand(1) != FoldedValue)
1096 return false; // Not commutative, only RHS foldable.
1097 // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
1098 // extension was made of the load.
1099 unsigned LoadOrTruncBits =
1100 ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
1101 switch (UserI->getOpcode()) {
1102 case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
1103 case Instruction::Sub:
1104 case Instruction::ICmp:
1105 if (LoadedBits == 32 && ZExtBits == 64)
1106 return true;
1107 [[fallthrough]];
1108 case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
1109 if (UserI->getOpcode() != Instruction::ICmp) {
1110 if (LoadedBits == 16 &&
1111 (SExtBits == 32 ||
1112 (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
1113 return true;
1114 if (LoadOrTruncBits == 16)
1115 return true;
1116 }
1117 [[fallthrough]];
1118 case Instruction::SDiv:// SE: 32->64
1119 if (LoadedBits == 32 && SExtBits == 64)
1120 return true;
1121 [[fallthrough]];
1122 case Instruction::UDiv:
1123 case Instruction::And:
1124 case Instruction::Or:
1125 case Instruction::Xor:
1126 // This also makes sense for float operations, but disabled for now due
1127 // to regressions.
1128 // case Instruction::FCmp:
1129 // case Instruction::FAdd:
1130 // case Instruction::FSub:
1131 // case Instruction::FMul:
1132 // case Instruction::FDiv:
1133
1134 // All possible extensions of memory checked above.
1135
1136 // Comparison between memory and immediate.
1137 if (UserI->getOpcode() == Instruction::ICmp)
1138 if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
1139 if (CI->getValue().isIntN(16))
1140 return true;
1141 return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
1142 break;
1143 }
1144 return false;
1145}
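// As an example (hypothetical IR): in
//   %v = load i32, ptr %p
//   %c = icmp eq i32 %v, 0
// the single-use load feeds a compare against a small immediate, so it is
// considered foldable into the user and getMemoryOpCost() below can return 0
// for it.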
1146
1147static bool isBswapIntrinsicCall(const Value *V) {
1148 if (const Instruction *I = dyn_cast<Instruction>(V))
1149 if (auto *CI = dyn_cast<CallInst>(I))
1150 if (auto *F = CI->getCalledFunction())
1151 if (F->getIntrinsicID() == Intrinsic::bswap)
1152 return true;
1153 return false;
1154}
1155
1156InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1157 MaybeAlign Alignment,
1158 unsigned AddressSpace,
1159 TTI::TargetCostKind CostKind,
1160 TTI::OperandValueInfo OpInfo,
1161 const Instruction *I) {
1162 assert(!Src->isVoidTy() && "Invalid type");
1163
1164 // TODO: Handle other cost kinds.
1165 if (CostKind != TTI::TCK_RecipThroughput)
1166 return 1;
1167
1168 if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
1169 // Store the load or its truncated or extended value in FoldedValue.
1170 const Instruction *FoldedValue = nullptr;
1171 if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
1172 const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
1173 assert (UserI->getNumOperands() == 2 && "Expected a binop.");
1174
1175 // UserI can't fold two loads, so in that case return 0 cost only
1176 // half of the time.
1177 for (unsigned i = 0; i < 2; ++i) {
1178 if (UserI->getOperand(i) == FoldedValue)
1179 continue;
1180
1181 if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
1182 LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
1183 if (!OtherLoad &&
1184 (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
1185 isa<ZExtInst>(OtherOp)))
1186 OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
1187 if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
1188 return i == 0; // Both operands foldable.
1189 }
1190 }
1191
1192 return 0; // Only I is foldable in user.
1193 }
1194 }
1195
1196 // Type legalization (via getNumberOfParts) can't handle structs
1197 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1198 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1199 CostKind);
1200
1201 // FP128 is a legal type but kept in a register pair on older CPUs.
1202 if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
1203 return 2;
1204
1205 unsigned NumOps =
1206 (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
1207
1208 // Store/Load reversed saves one instruction.
1209 if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
1210 I != nullptr) {
1211 if (Opcode == Instruction::Load && I->hasOneUse()) {
1212 const Instruction *LdUser = cast<Instruction>(*I->user_begin());
1213 // In case of load -> bswap -> store, return normal cost for the load.
1214 if (isBswapIntrinsicCall(LdUser) &&
1215 (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
1216 return 0;
1217 }
1218 else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
1219 const Value *StoredVal = SI->getValueOperand();
1220 if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
1221 return 0;
1222 }
1223 }
1224
1225 return NumOps;
1226}
1227
1228// The generic implementation of getInterleavedMemoryOpCost() is based on
1229// adding costs of the memory operations plus all the extracts and inserts
1230// needed for using / defining the vector operands. The SystemZ version does
1231// roughly the same but bases the computations on vector permutations
1232// instead.
1233InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
1234 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1235 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1236 bool UseMaskForCond, bool UseMaskForGaps) {
1237 if (UseMaskForCond || UseMaskForGaps)
1238 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1239 Alignment, AddressSpace, CostKind,
1240 UseMaskForCond, UseMaskForGaps);
1241 assert(isa<VectorType>(VecTy) &&
1242 "Expect a vector type for interleaved memory op");
1243
1244 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1245 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1246 unsigned VF = NumElts / Factor;
1247 unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
1248 unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
1249 unsigned NumPermutes = 0;
1250
1251 if (Opcode == Instruction::Load) {
1252 // Loading interleave groups may have gaps, which may mean fewer
1253 // loads. Find out how many vectors will be loaded in total, and in how
1254 // many of them each value will be.
1255 BitVector UsedInsts(NumVectorMemOps, false);
1256 std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
1257 for (unsigned Index : Indices)
1258 for (unsigned Elt = 0; Elt < VF; ++Elt) {
1259 unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
1260 UsedInsts.set(Vec);
1261 ValueVecs[Index].set(Vec);
1262 }
1263 NumVectorMemOps = UsedInsts.count();
1264
1265 for (unsigned Index : Indices) {
1266 // Estimate that each loaded source vector containing this Index
1267 // requires one operation, except that vperm can handle two input
1268 // registers the first time for each dst vector.
1269 unsigned NumSrcVecs = ValueVecs[Index].count();
1270 unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
1271 assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
1272 NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
1273 }
1274 } else {
1275 // Estimate the permutes for each stored vector as the smaller of the
1276 // number of elements and the number of source vectors. Subtract one per
1277 // dst vector for vperm (S.A.).
1278 unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
1279 unsigned NumDstVecs = NumVectorMemOps;
1280 NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
1281 }
1282
1283 // Cost of load/store operations and the permutations needed.
1284 return NumVectorMemOps + NumPermutes;
1285}
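// For example, an interleaved load of two i32 members with VF == 4 uses a
// <8 x i32> group: two vector loads are needed, and each member is gathered
// with a single vperm, giving a cost of 2 + 2 = 4.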
1286
1287static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
1288 if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1289 return getNumVectorRegs(RetTy); // VPERM
1290 return -1;
1291}
1292
1293InstructionCost
1294SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1295 TTI::TargetCostKind CostKind) {
1296 InstructionCost Cost =
1297 getVectorIntrinsicInstrCost(ICA.getID(), ICA.getReturnType());
1298 if (Cost != -1)
1299 return Cost;
1300 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1301}