SystemZTargetTransformInfo.cpp
1//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a TargetTransformInfo analysis pass specific to the
10// SystemZ target machine. It uses the target's detailed information to provide
11// more precise answers to certain TTI queries, while letting the target
12// independent and default TTI implementations handle the rest.
13//
14//===----------------------------------------------------------------------===//
15
16#include "SystemZTargetTransformInfo.h"
17#include "llvm/Analysis/TargetTransformInfo.h"
18#include "llvm/CodeGen/BasicTTIImpl.h"
19#include "llvm/CodeGen/CostTable.h"
20#include "llvm/CodeGen/TargetLowering.h"
21#include "llvm/IR/IntrinsicInst.h"
22#include "llvm/Support/Debug.h"
23#include "llvm/Support/MathExtras.h"
24
25using namespace llvm;
26
27#define DEBUG_TYPE "systemztti"
28
29//===----------------------------------------------------------------------===//
30//
31// SystemZ cost model.
32//
33//===----------------------------------------------------------------------===//
34
35static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
36 bool UsedAsMemCpySource = false;
37 for (const User *U : V->users())
38 if (const Instruction *User = dyn_cast<Instruction>(U)) {
39 if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
40 UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
41 continue;
42 }
43 if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
44 if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
45 UsedAsMemCpySource = true;
46 continue;
47 }
48 }
49 OtherUse = true;
50 }
51 return UsedAsMemCpySource;
52}
53
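// SystemZ-specific inlining bonus: each callee argument that is used only as
// the source of non-volatile memcpy calls (traced through bitcasts and GEPs)
// adds a bonus of 150 to the inlining threshold.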
54unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
55 unsigned Bonus = 0;
56
57 // Increase the threshold if an incoming argument is used only as a memcpy
58 // source.
59 if (Function *Callee = CB->getCalledFunction())
60 for (Argument &Arg : Callee->args()) {
61 bool OtherUse = false;
62 if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse)
63 Bonus += 150;
64 }
65
66 LLVM_DEBUG(if (Bonus)
67 dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
68 return Bonus;
69}
70
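// Cost of materializing the integer immediate Imm of type Ty: zero-valued and
// unmodeled-width constants are free, values that fit a single lgfi, llilf or
// llihf cost one instruction, and everything else (including i128 loads from
// the constant pool) costs two.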
71InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
72 TTI::TargetCostKind CostKind) {
73 assert(Ty->isIntegerTy());
74
75 unsigned BitSize = Ty->getPrimitiveSizeInBits();
76 // There is no cost model for constants with a bit size of 0. Return TCC_Free
77 // here, so that constant hoisting will ignore this constant.
78 if (BitSize == 0)
79 return TTI::TCC_Free;
80 // No cost model is implemented yet for operations on integers larger than 128 bits.
81 if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
82 return TTI::TCC_Free;
83
84 if (Imm == 0)
85 return TTI::TCC_Free;
86
87 if (Imm.getBitWidth() <= 64) {
88 // Constants loaded via lgfi.
89 if (isInt<32>(Imm.getSExtValue()))
90 return TTI::TCC_Basic;
91 // Constants loaded via llilf.
92 if (isUInt<32>(Imm.getZExtValue()))
93 return TTI::TCC_Basic;
94 // Constants loaded via llihf:
95 if ((Imm.getZExtValue() & 0xffffffff) == 0)
96 return TTI::TCC_Basic;
97
98 return 2 * TTI::TCC_Basic;
99 }
100
101 // i128 immediates are loaded from the constant pool.
102 return 2 * TTI::TCC_Basic;
103}
104
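// Cost of the immediate Imm when used as operand Idx of the given instruction.
// Immediates that can be encoded directly in the instruction (e.g. via mvhi,
// cgfi, algfi, oilf or nilf) are reported as TCC_Free so that constant
// hoisting leaves them in place.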
105InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
106 const APInt &Imm, Type *Ty,
107 TTI::TargetCostKind CostKind,
108 Instruction *Inst) {
109 assert(Ty->isIntegerTy());
110
111 unsigned BitSize = Ty->getPrimitiveSizeInBits();
112 // There is no cost model for constants with a bit size of 0. Return TCC_Free
113 // here, so that constant hoisting will ignore this constant.
114 if (BitSize == 0)
115 return TTI::TCC_Free;
116 // No cost model is implemented yet for operations on integers larger than 64 bits.
117 if (BitSize > 64)
118 return TTI::TCC_Free;
119
120 switch (Opcode) {
121 default:
122 return TTI::TCC_Free;
123 case Instruction::GetElementPtr:
124 // Always hoist the base address of a GetElementPtr. This prevents the
125 // creation of new constants for every base constant that gets constant
126 // folded with the offset.
127 if (Idx == 0)
128 return 2 * TTI::TCC_Basic;
129 return TTI::TCC_Free;
130 case Instruction::Store:
131 if (Idx == 0 && Imm.getBitWidth() <= 64) {
132 // Any 8-bit immediate store can be implemented via mvi.
133 if (BitSize == 8)
134 return TTI::TCC_Free;
135 // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
136 if (isInt<16>(Imm.getSExtValue()))
137 return TTI::TCC_Free;
138 }
139 break;
140 case Instruction::ICmp:
141 if (Idx == 1 && Imm.getBitWidth() <= 64) {
142 // Comparisons against signed 32-bit immediates implemented via cgfi.
143 if (isInt<32>(Imm.getSExtValue()))
144 return TTI::TCC_Free;
145 // Comparisons against unsigned 32-bit immediates implemented via clgfi.
146 if (isUInt<32>(Imm.getZExtValue()))
147 return TTI::TCC_Free;
148 }
149 break;
150 case Instruction::Add:
151 case Instruction::Sub:
152 if (Idx == 1 && Imm.getBitWidth() <= 64) {
153 // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
154 if (isUInt<32>(Imm.getZExtValue()))
155 return TTI::TCC_Free;
156 // Or their negation, by swapping addition vs. subtraction.
157 if (isUInt<32>(-Imm.getSExtValue()))
158 return TTI::TCC_Free;
159 }
160 break;
161 case Instruction::Mul:
162 if (Idx == 1 && Imm.getBitWidth() <= 64) {
163 // We use msgfi to multiply by 32-bit signed immediates.
164 if (isInt<32>(Imm.getSExtValue()))
165 return TTI::TCC_Free;
166 }
167 break;
168 case Instruction::Or:
169 case Instruction::Xor:
170 if (Idx == 1 && Imm.getBitWidth() <= 64) {
171 // Masks supported by oilf/xilf.
172 if (isUInt<32>(Imm.getZExtValue()))
173 return TTI::TCC_Free;
174 // Masks supported by oihf/xihf.
175 if ((Imm.getZExtValue() & 0xffffffff) == 0)
176 return TTI::TCC_Free;
177 }
178 break;
179 case Instruction::And:
180 if (Idx == 1 && Imm.getBitWidth() <= 64) {
181 // Any 32-bit AND operation can be implemented via nilf.
182 if (BitSize <= 32)
183 return TTI::TCC_Free;
184 // 64-bit masks supported by nilf.
185 if (isUInt<32>(~Imm.getZExtValue()))
186 return TTI::TCC_Free;
187 // 64-bit masks supported by nilh.
188 if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
189 return TTI::TCC_Free;
190 // Some 64-bit AND operations can be implemented via risbg.
191 const SystemZInstrInfo *TII = ST->getInstrInfo();
192 unsigned Start, End;
193 if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
194 return TTI::TCC_Free;
195 }
196 break;
197 case Instruction::Shl:
198 case Instruction::LShr:
199 case Instruction::AShr:
200 // Always return TCC_Free for the shift value of a shift instruction.
201 if (Idx == 1)
202 return TTI::TCC_Free;
203 break;
204 case Instruction::UDiv:
205 case Instruction::SDiv:
206 case Instruction::URem:
207 case Instruction::SRem:
208 case Instruction::Trunc:
209 case Instruction::ZExt:
210 case Instruction::SExt:
211 case Instruction::IntToPtr:
212 case Instruction::PtrToInt:
213 case Instruction::BitCast:
214 case Instruction::PHI:
215 case Instruction::Call:
216 case Instruction::Select:
217 case Instruction::Ret:
218 case Instruction::Load:
219 break;
220 }
221
222 return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
223}
224
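// Same idea as above, but for immediates used by target-independent
// intrinsics such as the overflow-checking arithmetic intrinsics, stackmap
// and patchpoint.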
225InstructionCost
226SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
227 const APInt &Imm, Type *Ty,
228 TTI::TargetCostKind CostKind) {
229 assert(Ty->isIntegerTy());
230
231 unsigned BitSize = Ty->getPrimitiveSizeInBits();
232 // There is no cost model for constants with a bit size of 0. Return TCC_Free
233 // here, so that constant hoisting will ignore this constant.
234 if (BitSize == 0)
235 return TTI::TCC_Free;
236 // No cost model is implemented yet for operations on integers larger than 64 bits.
237 if (BitSize > 64)
238 return TTI::TCC_Free;
239
240 switch (IID) {
241 default:
242 return TTI::TCC_Free;
243 case Intrinsic::sadd_with_overflow:
244 case Intrinsic::uadd_with_overflow:
245 case Intrinsic::ssub_with_overflow:
246 case Intrinsic::usub_with_overflow:
247 // These get expanded to include a normal addition/subtraction.
248 if (Idx == 1 && Imm.getBitWidth() <= 64) {
249 if (isUInt<32>(Imm.getZExtValue()))
250 return TTI::TCC_Free;
251 if (isUInt<32>(-Imm.getSExtValue()))
252 return TTI::TCC_Free;
253 }
254 break;
255 case Intrinsic::smul_with_overflow:
256 case Intrinsic::umul_with_overflow:
257 // These get expanded to include a normal multiplication.
258 if (Idx == 1 && Imm.getBitWidth() <= 64) {
259 if (isInt<32>(Imm.getSExtValue()))
260 return TTI::TCC_Free;
261 }
262 break;
263 case Intrinsic::experimental_stackmap:
264 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
265 return TTI::TCC_Free;
266 break;
267 case Intrinsic::experimental_patchpoint_void:
268 case Intrinsic::experimental_patchpoint:
269 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
270 return TTI::TCC_Free;
271 break;
272 }
273 return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
274}
275
276TargetTransformInfo::PopcntSupportKind
277SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
278 assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
279 if (ST->hasPopulationCount() && TyWidth <= 64)
280 return TTI::PSK_FastHardware;
281 return TTI::PSK_Software;
282}
283
284void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
285 TTI::UnrollingPreferences &UP,
286 OptimizationRemarkEmitter *ORE) {
287 // Find out if L contains a call, what the machine instruction count
288 // estimate is, and how many stores there are.
289 bool HasCall = false;
290 InstructionCost NumStores = 0;
291 for (auto &BB : L->blocks())
292 for (auto &I : *BB) {
293 if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
294 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
295 if (isLoweredToCall(F))
296 HasCall = true;
297 if (F->getIntrinsicID() == Intrinsic::memcpy ||
298 F->getIntrinsicID() == Intrinsic::memset)
299 NumStores++;
300 } else { // indirect call.
301 HasCall = true;
302 }
303 }
304 if (isa<StoreInst>(&I)) {
305 Type *MemAccessTy = I.getOperand(0)->getType();
306 NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy,
307 std::nullopt, 0, TTI::TCK_RecipThroughput);
308 }
309 }
310
311 // The z13 processor will run out of store tags if too many stores
312 // are fed into it too quickly. Therefore make sure there are not
313 // too many stores in the resulting unrolled loop.
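// For example, a loop body containing 3 stores has its unroll count capped at
// 12 / 3 = 4.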
314 unsigned const NumStoresVal = *NumStores.getValue();
315 unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);
316
317 if (HasCall) {
318 // If the loop contains calls, only allow full unrolling (no partial unrolling).
319 UP.FullUnrollMaxCount = Max;
320 UP.MaxCount = 1;
321 return;
322 }
323
324 UP.MaxCount = Max;
325 if (UP.MaxCount <= 1)
326 return;
327
328 // Allow partial and runtime trip count unrolling.
329 UP.Partial = UP.Runtime = true;
330
331 UP.PartialThreshold = 75;
332 UP.DefaultUnrollRuntimeCount = 4;
333
334 // Allow expensive instructions in the pre-header of the loop.
335 UP.AllowExpensiveTripCount = true;
336
337 UP.Force = true;
338}
339
340void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
341 TTI::PeelingPreferences &PP) {
342 BaseT::getPeelingPreferences(L, SE, PP);
343}
344
345bool SystemZTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
346 const TargetTransformInfo::LSRCost &C2) {
347 // SystemZ specific: check instruction count (first), and don't care about
348 // ImmCost, since offsets are checked explicitly.
349 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
350 C1.NumIVMuls, C1.NumBaseAdds,
351 C1.ScaleCost, C1.SetupCost) <
352 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
353 C2.NumIVMuls, C2.NumBaseAdds,
354 C2.ScaleCost, C2.SetupCost);
355}
356
357unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
358 bool Vector = (ClassID == 1);
359 if (!Vector)
360 // Discount the stack pointer. Also leave out %r0, since it can't
361 // be used in an address.
362 return 14;
363 if (ST->hasVector())
364 return 32;
365 return 0;
366}
367
368TypeSize
369SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
370 switch (K) {
371 case TargetTransformInfo::RGK_Scalar:
372 return TypeSize::getFixed(64);
373 case TargetTransformInfo::RGK_FixedWidthVector:
374 return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
375 case TargetTransformInfo::RGK_ScalableVector:
376 return TypeSize::getScalable(0);
377 }
378
379 llvm_unreachable("Unsupported register kind");
380}
381
382unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
383 unsigned NumStridedMemAccesses,
384 unsigned NumPrefetches,
385 bool HasCall) const {
386 // Don't prefetch a loop with many far apart accesses.
387 if (NumPrefetches > 16)
388 return UINT_MAX;
389
390 // Emit prefetch instructions for smaller strides in cases where we think
391 // the hardware prefetcher might not be able to keep up.
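// (The second condition below fires only when strided accesses outnumber the
// remaining memory accesses by at least 32 to 1.)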
392 if (NumStridedMemAccesses > 32 && !HasCall &&
393 (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
394 return 1;
395
396 return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
397}
398
399bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
400 EVT VT = TLI->getValueType(DL, DataType);
401 return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
402}
403
404// Return the bit size for the scalar type or vector element
405// type. getScalarSizeInBits() returns 0 for a pointer type.
406static unsigned getScalarSizeInBits(Type *Ty) {
407 unsigned Size =
408 (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
409 assert(Size > 0 && "Element must have non-zero size.");
410 return Size;
411}
412
413// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
414// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
415// 3.
416static unsigned getNumVectorRegs(Type *Ty) {
417 auto *VTy = cast<FixedVectorType>(Ty);
418 unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
419 assert(WideBits > 0 && "Could not compute size of vector");
420 return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
421}
422
423InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
424 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
425 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
426 ArrayRef<const Value *> Args,
427 const Instruction *CxtI) {
428
429 // TODO: Handle more cost kinds.
430 if (CostKind != TTI::TCK_RecipThroughput)
431 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
432 Op2Info, Args, CxtI);
433
434 // TODO: return a good value for BB-VECTORIZER that includes the
435 // immediate loads, which we do not want to count for the loop
436 // vectorizer, since they are hopefully hoisted out of the loop. This
437 // would require a new parameter 'InLoop', but not sure if constant
438 // args are common enough to motivate this.
439
440 unsigned ScalarBits = Ty->getScalarSizeInBits();
441
442 // There are three cases of division and remainder: Dividing with a register
443 // needs a divide instruction. A divisor which is a power of two constant
444 // can be implemented with a sequence of shifts. Any other constant needs a
445 // multiply and shifts.
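// As used below: a register divisor costs DivInstrCost, a power-of-2 constant
// divisor costs SDivPow2Cost (signed) or 1 (unsigned), and any other constant
// divisor costs DivMulSeqCost; the vector path scales these by the number of
// vector registers or the vectorization factor.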
446 const unsigned DivInstrCost = 20;
447 const unsigned DivMulSeqCost = 10;
448 const unsigned SDivPow2Cost = 4;
449
450 bool SignedDivRem =
451 Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
452 bool UnsignedDivRem =
453 Opcode == Instruction::UDiv || Opcode == Instruction::URem;
454
455 // Check for a constant divisor.
456 bool DivRemConst = false;
457 bool DivRemConstPow2 = false;
458 if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
459 if (const Constant *C = dyn_cast<Constant>(Args[1])) {
460 const ConstantInt *CVal =
461 (C->getType()->isVectorTy()
462 ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
463 : dyn_cast<const ConstantInt>(C));
464 if (CVal && (CVal->getValue().isPowerOf2() ||
465 CVal->getValue().isNegatedPowerOf2()))
466 DivRemConstPow2 = true;
467 else
468 DivRemConst = true;
469 }
470 }
471
472 if (!Ty->isVectorTy()) {
473 // These FP operations are supported with a dedicated instruction for
474 // float, double and fp128 (base implementation assumes float generally
475 // costs 2).
476 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
477 Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
478 return 1;
479
480 // There is no native support for FRem.
481 if (Opcode == Instruction::FRem)
482 return LIBCALL_COST;
483
484 // Give discount for some combined logical operations if supported.
485 if (Args.size() == 2) {
486 if (Opcode == Instruction::Xor) {
487 for (const Value *A : Args) {
488 if (const Instruction *I = dyn_cast<Instruction>(A))
489 if (I->hasOneUse() &&
490 (I->getOpcode() == Instruction::Or ||
491 I->getOpcode() == Instruction::And ||
492 I->getOpcode() == Instruction::Xor))
493 if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
494 (isInt128InVR(Ty) &&
495 (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1())))
496 return 0;
497 }
498 }
499 else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
500 for (const Value *A : Args) {
501 if (const Instruction *I = dyn_cast<Instruction>(A))
502 if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
503 ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
504 (isInt128InVR(Ty) &&
505 (Opcode == Instruction::And || ST->hasVectorEnhancements1()))))
506 return 0;
507 }
508 }
509 }
510
511 // Or requires one instruction, although it has custom handling for i64.
512 if (Opcode == Instruction::Or)
513 return 1;
514
515 if (Opcode == Instruction::Xor && ScalarBits == 1) {
516 if (ST->hasLoadStoreOnCond2())
517 return 5; // 2 * (li 0; loc 1); xor
518 return 7; // 2 * ipm sequences ; xor ; shift ; compare
519 }
520
521 if (DivRemConstPow2)
522 return (SignedDivRem ? SDivPow2Cost : 1);
523 if (DivRemConst)
524 return DivMulSeqCost;
525 if (SignedDivRem || UnsignedDivRem)
526 return DivInstrCost;
527 }
528 else if (ST->hasVector()) {
529 auto *VTy = cast<FixedVectorType>(Ty);
530 unsigned VF = VTy->getNumElements();
531 unsigned NumVectors = getNumVectorRegs(Ty);
532
533 // These vector operations are custom handled, but are still supported
534 // with one instruction per vector, regardless of element size.
535 if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
536 Opcode == Instruction::AShr) {
537 return NumVectors;
538 }
539
540 if (DivRemConstPow2)
541 return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
542 if (DivRemConst) {
543 SmallVector<Type *> Tys(Args.size(), Ty);
544 return VF * DivMulSeqCost +
545 getScalarizationOverhead(VTy, Args, Tys, CostKind);
546 }
547 if ((SignedDivRem || UnsignedDivRem) && VF > 4)
548 // Temporary hack: disable high vectorization factors with integer
549 // division/remainder, which will get scalarized and handled with
550 // GR128 registers. The mischeduler is not clever enough to avoid
551 // spilling yet.
552 return 1000;
553
554 // These FP operations are supported with a single vector instruction for
555 // double (base implementation assumes float generally costs 2). For
556 // FP128, the scalar cost is 1, and there is no overhead since the values
557 // are already in scalar registers.
558 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
559 Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
560 switch (ScalarBits) {
561 case 32: {
562 // The vector enhancements facility 1 provides v4f32 instructions.
563 if (ST->hasVectorEnhancements1())
564 return NumVectors;
565 // Return the cost of multiple scalar invocation plus the cost of
566 // inserting and extracting the values.
567 InstructionCost ScalarCost =
568 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
569 SmallVector<Type *> Tys(Args.size(), Ty);
570 InstructionCost Cost =
571 (VF * ScalarCost) +
572 getScalarizationOverhead(VTy, Args, Tys, CostKind);
573 // FIXME: VF 2 for these FP operations is currently just as
574 // expensive as for VF 4.
575 if (VF == 2)
576 Cost *= 2;
577 return Cost;
578 }
579 case 64:
580 case 128:
581 return NumVectors;
582 default:
583 break;
584 }
585 }
586
587 // There is no native support for FRem.
588 if (Opcode == Instruction::FRem) {
589 SmallVector<Type *> Tys(Args.size(), Ty);
590 InstructionCost Cost = (VF * LIBCALL_COST) +
591 getScalarizationOverhead(VTy, Args, Tys, CostKind);
592 // FIXME: VF 2 for float is currently just as expensive as for VF 4.
593 if (VF == 2 && ScalarBits == 32)
594 Cost *= 2;
595 return Cost;
596 }
597 }
598
599 // Fallback to the default implementation.
600 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
601 Args, CxtI);
602}
603
604InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
605 VectorType *Tp,
606 ArrayRef<int> Mask,
607 TTI::TargetCostKind CostKind,
608 int Index, VectorType *SubTp,
609 ArrayRef<const Value *> Args) {
610 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
611 if (ST->hasVector()) {
612 unsigned NumVectors = getNumVectorRegs(Tp);
613
614 // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
615
616 // FP128 values are always in scalar registers, so there is no work
617 // involved with a shuffle, except for broadcast. In that case register
618 // moves are done with a single instruction per element.
619 if (Tp->getScalarType()->isFP128Ty())
620 return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
621
622 switch (Kind) {
623 case TargetTransformInfo::SK_ExtractSubvector:
624 // ExtractSubvector Index indicates start offset.
625
626 // Extracting a subvector from first index is a noop.
627 return (Index == 0 ? 0 : NumVectors);
628
629 case TargetTransformInfo::SK_Broadcast:
630 // Loop vectorizer calls here to figure out the extra cost of
631 // broadcasting a loaded value to all elements of a vector. Since vlrep
632 // loads and replicates with a single instruction, adjust the returned
633 // value.
634 return NumVectors - 1;
635
636 default:
637
638 // SystemZ supports single instruction permutation / replication.
639 return NumVectors;
640 }
641 }
642
643 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
644}
645
646// Return the log2 difference of the element sizes of the two vector types.
647static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
648 unsigned Bits0 = Ty0->getScalarSizeInBits();
649 unsigned Bits1 = Ty1->getScalarSizeInBits();
650
651 if (Bits1 > Bits0)
652 return (Log2_32(Bits1) - Log2_32(Bits0));
653
654 return (Log2_32(Bits0) - Log2_32(Bits1));
655}
656
657// Return the number of instructions needed to truncate SrcTy to DstTy.
658unsigned SystemZTTIImpl::
659getVectorTruncCost(Type *SrcTy, Type *DstTy) {
660 assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
661 assert(SrcTy->getPrimitiveSizeInBits().getFixedValue() >
662 DstTy->getPrimitiveSizeInBits().getFixedValue() &&
663 "Packing must reduce size of vector type.");
664 assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
665 cast<FixedVectorType>(DstTy)->getNumElements() &&
666 "Packing should not change number of elements.");
667
668 // TODO: Since fp32 is expanded, the extract cost should always be 0.
669
670 unsigned NumParts = getNumVectorRegs(SrcTy);
671 if (NumParts <= 2)
672 // Up to 2 vector registers can be truncated efficiently with pack or
673 // permute. The latter requires an immediate mask to be loaded, which
674 // typically gets hoisted out of a loop. TODO: return a good value for
675 // BB-VECTORIZER that includes the immediate loads, which we do not want
676 // to count for the loop vectorizer.
677 return 1;
678
679 unsigned Cost = 0;
680 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
681 unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
682 for (unsigned P = 0; P < Log2Diff; ++P) {
683 if (NumParts > 1)
684 NumParts /= 2;
685 Cost += NumParts;
686 }
687
688 // Currently, a general mix of permutes and pack instructions is output by
689 // isel, which follows the cost computation above except for this case which
690 // is one instruction less:
691 if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
692 DstTy->getScalarSizeInBits() == 8)
693 Cost--;
694
695 return Cost;
696}
697
698// Return the cost of converting a vector bitmask produced by a compare
699// (SrcTy), to the type of the select or extend instruction (DstTy).
700unsigned SystemZTTIImpl::
701getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
702 assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
703 "Should only be called with vector types.");
704
705 unsigned PackCost = 0;
706 unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
707 unsigned DstScalarBits = DstTy->getScalarSizeInBits();
708 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
709 if (SrcScalarBits > DstScalarBits)
710 // The bitmask will be truncated.
711 PackCost = getVectorTruncCost(SrcTy, DstTy);
712 else if (SrcScalarBits < DstScalarBits) {
713 unsigned DstNumParts = getNumVectorRegs(DstTy);
714 // Each vector select needs its part of the bitmask unpacked.
715 PackCost = Log2Diff * DstNumParts;
716 // Extra cost for moving part of mask before unpacking.
717 PackCost += DstNumParts - 1;
718 }
719
720 return PackCost;
721}
722
723// Return the type of the compared operands. This is needed to compute the
724// cost for a Select / ZExt or SExt instruction.
725static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
726 Type *OpTy = nullptr;
727 if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
728 OpTy = CI->getOperand(0)->getType();
729 else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
730 if (LogicI->getNumOperands() == 2)
731 if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
732 if (isa<CmpInst>(LogicI->getOperand(1)))
733 OpTy = CI0->getOperand(0)->getType();
734
735 if (OpTy != nullptr) {
736 if (VF == 1) {
737 assert (!OpTy->isVectorTy() && "Expected scalar type");
738 return OpTy;
739 }
740 // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
741 // be either scalar or already vectorized with the same or a lesser VF.
742 Type *ElTy = OpTy->getScalarType();
743 return FixedVectorType::get(ElTy, VF);
744 }
745
746 return nullptr;
747}
748
749// Get the cost of converting a boolean vector to a vector with same width
750// and element size as Dst, plus the cost of zero extending if needed.
751unsigned SystemZTTIImpl::
752getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
753 const Instruction *I) {
754 auto *DstVTy = cast<FixedVectorType>(Dst);
755 unsigned VF = DstVTy->getNumElements();
756 unsigned Cost = 0;
757 // If we know the widths of the compared operands, get any cost of
758 // converting it to match Dst. Otherwise assume same widths.
759 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
760 if (CmpOpTy != nullptr)
761 Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
762 if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
763 // One 'vn' per dst vector with an immediate mask.
764 Cost += getNumVectorRegs(Dst);
765 return Cost;
766}
767
768InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
769 Type *Src,
770 TTI::CastContextHint CCH,
771 TTI::TargetCostKind CostKind,
772 const Instruction *I) {
773 // FIXME: Can the logic below also be used for these cost kinds?
774 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
775 auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
776 return BaseCost == 0 ? BaseCost : 1;
777 }
778
779 unsigned DstScalarBits = Dst->getScalarSizeInBits();
780 unsigned SrcScalarBits = Src->getScalarSizeInBits();
781
782 if (!Src->isVectorTy()) {
783 assert (!Dst->isVectorTy());
784
785 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
786 if (Src->isIntegerTy(128))
787 return LIBCALL_COST;
788 if (SrcScalarBits >= 32 ||
789 (I != nullptr && isa<LoadInst>(I->getOperand(0))))
790 return 1;
791 return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
792 }
793
794 if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
795 Dst->isIntegerTy(128))
796 return LIBCALL_COST;
797
798 if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
799 if (Src->isIntegerTy(1)) {
800 if (DstScalarBits == 128)
801 return 5 /*branch seq.*/;
802
803 if (ST->hasLoadStoreOnCond2())
804 return 2; // li 0; loc 1
805
806 // This should be the extension of a compare i1 result, which is done with
807 // ipm and a varying sequence of instructions.
808 unsigned Cost = 0;
809 if (Opcode == Instruction::SExt)
810 Cost = (DstScalarBits < 64 ? 3 : 4);
811 if (Opcode == Instruction::ZExt)
812 Cost = 3;
813 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
814 if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
815 // If operands of an fp-type were compared, this costs +1.
816 Cost++;
817 return Cost;
818 }
819 else if (isInt128InVR(Dst)) {
820 // Extensions from GPR to i128 (in VR) typically cost two instructions,
821 // but a zero-extending load would be just one extra instruction.
822 if (Opcode == Instruction::ZExt && I != nullptr)
823 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
824 if (Ld->hasOneUse())
825 return 1;
826 return 2;
827 }
828 }
829
830 if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
831 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
832 if (Ld->hasOneUse())
833 return 0; // Will be converted to GPR load.
834 bool OnlyTruncatingStores = true;
835 for (const User *U : I->users())
836 if (!isa<StoreInst>(U)) {
837 OnlyTruncatingStores = false;
838 break;
839 }
840 if (OnlyTruncatingStores)
841 return 0;
842 return 2; // Vector element extraction.
843 }
844 }
845 else if (ST->hasVector()) {
846 // Vector to scalar cast.
847 auto *SrcVecTy = cast<FixedVectorType>(Src);
848 auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
849 if (!DstVecTy) {
850 // TODO: tune vector-to-scalar cast.
851 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
852 }
853 unsigned VF = SrcVecTy->getNumElements();
854 unsigned NumDstVectors = getNumVectorRegs(Dst);
855 unsigned NumSrcVectors = getNumVectorRegs(Src);
856
857 if (Opcode == Instruction::Trunc) {
858 if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
859 return 0; // Check for NOOP conversions.
860 return getVectorTruncCost(Src, Dst);
861 }
862
863 if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
864 if (SrcScalarBits >= 8) {
865 // ZExt will use either a single unpack or a vector permute.
866 if (Opcode == Instruction::ZExt)
867 return NumDstVectors;
868
869 // SExt will be handled with one unpack per doubling of width.
870 unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
871
872 // For types that span multiple vector registers, some additional
873 // instructions are used to set up the unpacking.
874 unsigned NumSrcVectorOps =
875 (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
876 : (NumDstVectors / 2));
877
878 return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
879 }
880 else if (SrcScalarBits == 1)
881 return getBoolVecToIntConversionCost(Opcode, Dst, I);
882 }
883
884 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
885 Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
886 // TODO: Fix base implementation which could simplify things a bit here
887 // (seems to miss on differentiating on scalar/vector types).
888
889 // Only 64 bit vector conversions are natively supported before z15.
890 if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
891 if (SrcScalarBits == DstScalarBits)
892 return NumDstVectors;
893
894 if (SrcScalarBits == 1)
895 return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
896 }
897
898 // Return the cost of multiple scalar invocation plus the cost of
899 // inserting and extracting the values. Base implementation does not
900 // realize float->int gets scalarized.
901 InstructionCost ScalarCost = getCastInstrCost(
902 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
903 InstructionCost TotCost = VF * ScalarCost;
904 bool NeedsInserts = true, NeedsExtracts = true;
905 // FP128 registers do not get inserted or extracted.
906 if (DstScalarBits == 128 &&
907 (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
908 NeedsInserts = false;
909 if (SrcScalarBits == 128 &&
910 (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
911 NeedsExtracts = false;
912
913 TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
914 NeedsExtracts, CostKind);
915 TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
916 /*Extract*/ false, CostKind);
917
918 // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
919 if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
920 TotCost *= 2;
921
922 return TotCost;
923 }
924
925 if (Opcode == Instruction::FPTrunc) {
926 if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
927 return VF /*ldxbr/lexbr*/ +
928 getScalarizationOverhead(DstVecTy, /*Insert*/ true,
929 /*Extract*/ false, CostKind);
930 else // double -> float
931 return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
932 }
933
934 if (Opcode == Instruction::FPExt) {
935 if (SrcScalarBits == 32 && DstScalarBits == 64) {
936 // float -> double is very rare and currently unoptimized. Instead of
937 // using vldeb, which can do two at a time, all conversions are
938 // scalarized.
939 return VF * 2;
940 }
941 // -> fp128. VF * lxdb/lxeb + extraction of elements.
942 return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
943 /*Extract*/ true, CostKind);
944 }
945 }
946
947 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
948}
949
950// Scalar i8 / i16 operations will typically be made after first extending
951// the operands to i32.
952static unsigned getOperandsExtensionCost(const Instruction *I) {
953 unsigned ExtCost = 0;
954 for (Value *Op : I->operands())
955 // A load of i8 or i16 sign/zero extends to i32.
956 if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
957 ExtCost++;
958
959 return ExtCost;
960}
961
962InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
963 Type *CondTy,
964 CmpInst::Predicate VecPred,
965 TTI::TargetCostKind CostKind,
966 const Instruction *I) {
967 if (CostKind != TTI::TCK_RecipThroughput)
968 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
969
970 if (!ValTy->isVectorTy()) {
971 switch (Opcode) {
972 case Instruction::ICmp: {
973 // A loaded value compared with 0 with multiple users becomes Load and
974 // Test. The load is then not foldable, so return 0 cost for the ICmp.
975 unsigned ScalarBits = ValTy->getScalarSizeInBits();
976 if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
977 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
978 if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
979 if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
980 C->isZero())
981 return 0;
982
983 unsigned Cost = 1;
984 if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
985 Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
986 return Cost;
987 }
988 case Instruction::Select:
989 if (ValTy->isFloatingPointTy() || isInt128InVR(ValTy))
990 return 4; // No LOC for FP / i128 - costs a conditional jump.
991 return 1; // Load On Condition / Select Register.
992 }
993 }
994 else if (ST->hasVector()) {
995 unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();
996
997 // Called with a compare instruction.
998 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
999 unsigned PredicateExtraCost = 0;
1000 if (I != nullptr) {
1001 // Some predicates cost one or two extra instructions.
1002 switch (cast<CmpInst>(I)->getPredicate()) {
1003 case CmpInst::Predicate::ICMP_NE:
1004 case CmpInst::Predicate::ICMP_UGE:
1005 case CmpInst::Predicate::ICMP_ULE:
1006 case CmpInst::Predicate::ICMP_SGE:
1007 case CmpInst::Predicate::ICMP_SLE:
1008 PredicateExtraCost = 1;
1009 break;
1010 case CmpInst::Predicate::FCMP_ONE:
1011 case CmpInst::Predicate::FCMP_ORD:
1012 case CmpInst::Predicate::FCMP_UEQ:
1013 case CmpInst::Predicate::FCMP_UNO:
1014 PredicateExtraCost = 2;
1015 break;
1016 default:
1017 break;
1018 }
1019 }
1020
1021 // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
1022 // floats. FIXME: <2 x float> generates same code as <4 x float>.
1023 unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
1024 unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
1025
1026 unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
1027 return Cost;
1028 }
1029 else { // Called with a select instruction.
1030 assert (Opcode == Instruction::Select);
1031
1032 // We can figure out the extra cost of packing / unpacking if the
1033 // instruction was passed and the compare instruction is found.
1034 unsigned PackCost = 0;
1035 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
1036 if (CmpOpTy != nullptr)
1037 PackCost =
1038 getVectorBitmaskConversionCost(CmpOpTy, ValTy);
1039
1040 return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
1041 }
1042 }
1043
1044 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
1045}
1046
1047InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1048 TTI::TargetCostKind CostKind,
1049 unsigned Index, Value *Op0,
1050 Value *Op1) {
1051 // vlvgp will insert two grs into a vector register, so only count half the
1052 // number of instructions.
1053 if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
1054 return ((Index % 2 == 0) ? 1 : 0);
1055
1056 if (Opcode == Instruction::ExtractElement) {
1057 int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
1058
1059 // Give a slight penalty for moving out of vector pipeline to FXU unit.
1060 if (Index == 0 && Val->isIntOrIntVectorTy())
1061 Cost += 1;
1062
1063 return Cost;
1064 }
1065
1066 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1067}
1068
1069// Check if a load may be folded as a memory operand in its user.
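// For example, an i32 load whose only user is an add can be folded into the
// register-memory form of the add, which makes the load itself free.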
1070bool SystemZTTIImpl::
1071isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
1072 if (!Ld->hasOneUse())
1073 return false;
1074 FoldedValue = Ld;
1075 const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
1076 unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
1077 unsigned TruncBits = 0;
1078 unsigned SExtBits = 0;
1079 unsigned ZExtBits = 0;
1080 if (UserI->hasOneUse()) {
1081 unsigned UserBits = UserI->getType()->getScalarSizeInBits();
1082 if (isa<TruncInst>(UserI))
1083 TruncBits = UserBits;
1084 else if (isa<SExtInst>(UserI))
1085 SExtBits = UserBits;
1086 else if (isa<ZExtInst>(UserI))
1087 ZExtBits = UserBits;
1088 }
1089 if (TruncBits || SExtBits || ZExtBits) {
1090 FoldedValue = UserI;
1091 UserI = cast<Instruction>(*UserI->user_begin());
1092 // Load (single use) -> trunc/extend (single use) -> UserI
1093 }
1094 if ((UserI->getOpcode() == Instruction::Sub ||
1095 UserI->getOpcode() == Instruction::SDiv ||
1096 UserI->getOpcode() == Instruction::UDiv) &&
1097 UserI->getOperand(1) != FoldedValue)
1098 return false; // Not commutative, only RHS foldable.
1099 // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
1100 // extension was made of the load.
1101 unsigned LoadOrTruncBits =
1102 ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
1103 switch (UserI->getOpcode()) {
1104 case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
1105 case Instruction::Sub:
1106 case Instruction::ICmp:
1107 if (LoadedBits == 32 && ZExtBits == 64)
1108 return true;
1109 [[fallthrough]];
1110 case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
1111 if (UserI->getOpcode() != Instruction::ICmp) {
1112 if (LoadedBits == 16 &&
1113 (SExtBits == 32 ||
1114 (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
1115 return true;
1116 if (LoadOrTruncBits == 16)
1117 return true;
1118 }
1119 [[fallthrough]];
1120 case Instruction::SDiv:// SE: 32->64
1121 if (LoadedBits == 32 && SExtBits == 64)
1122 return true;
1123 [[fallthrough]];
1124 case Instruction::UDiv:
1125 case Instruction::And:
1126 case Instruction::Or:
1127 case Instruction::Xor:
1128 // This also makes sense for float operations, but disabled for now due
1129 // to regressions.
1130 // case Instruction::FCmp:
1131 // case Instruction::FAdd:
1132 // case Instruction::FSub:
1133 // case Instruction::FMul:
1134 // case Instruction::FDiv:
1135
1136 // All possible extensions of memory checked above.
1137
1138 // Comparison between memory and immediate.
1139 if (UserI->getOpcode() == Instruction::ICmp)
1140 if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
1141 if (CI->getValue().isIntN(16))
1142 return true;
1143 return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
1144 break;
1145 }
1146 return false;
1147}
1148
1149static bool isBswapIntrinsicCall(const Value *V) {
1150 if (const Instruction *I = dyn_cast<Instruction>(V))
1151 if (auto *CI = dyn_cast<CallInst>(I))
1152 if (auto *F = CI->getCalledFunction())
1153 if (F->getIntrinsicID() == Intrinsic::bswap)
1154 return true;
1155 return false;
1156}
1157
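// Loads that fold into a register-memory user, and loads/stores paired with a
// bswap that become load/store-reversed instructions, can be free; otherwise
// the cost is roughly the number of vector or GPR parts transferred.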
1158InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1159 MaybeAlign Alignment,
1160 unsigned AddressSpace,
1161 TTI::TargetCostKind CostKind,
1162 TTI::OperandValueInfo OpInfo,
1163 const Instruction *I) {
1164 assert(!Src->isVoidTy() && "Invalid type");
1165
1166 // TODO: Handle other cost kinds.
1167 if (CostKind != TTI::TCK_RecipThroughput)
1168 return 1;
1169
1170 if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
1171 // Store the load or its truncated or extended value in FoldedValue.
1172 const Instruction *FoldedValue = nullptr;
1173 if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
1174 const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
1175 assert (UserI->getNumOperands() == 2 && "Expected a binop.");
1176
1177 // UserI can't fold two loads, so in that case return 0 cost only
1178 // half of the time.
1179 for (unsigned i = 0; i < 2; ++i) {
1180 if (UserI->getOperand(i) == FoldedValue)
1181 continue;
1182
1183 if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
1184 LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
1185 if (!OtherLoad &&
1186 (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
1187 isa<ZExtInst>(OtherOp)))
1188 OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
1189 if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
1190 return i == 0; // Both operands foldable.
1191 }
1192 }
1193
1194 return 0; // Only I is foldable in user.
1195 }
1196 }
1197
1198 // Type legalization (via getNumberOfParts) can't handle structs
1199 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1200 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1201 CostKind);
1202
1203 // FP128 is a legal type but kept in a register pair on older CPUs.
1204 if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
1205 return 2;
1206
1207 unsigned NumOps =
1208 (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
1209
1210 // Store/Load reversed saves one instruction.
1211 if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
1212 I != nullptr) {
1213 if (Opcode == Instruction::Load && I->hasOneUse()) {
1214 const Instruction *LdUser = cast<Instruction>(*I->user_begin());
1215 // In case of load -> bswap -> store, return normal cost for the load.
1216 if (isBswapIntrinsicCall(LdUser) &&
1217 (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
1218 return 0;
1219 }
1220 else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
1221 const Value *StoredVal = SI->getValueOperand();
1222 if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
1223 return 0;
1224 }
1225 }
1226
1227 return NumOps;
1228}
1229
1230// The generic implementation of getInterleavedMemoryOpCost() is based on
1231// adding costs of the memory operations plus all the extracts and inserts
1232// needed for using / defining the vector operands. The SystemZ version does
1233// roughly the same but bases the computations on vector permutations
1234// instead.
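// For example, loading both elements of a factor-2 interleave group of type
// <8 x i32> takes 2 vector loads plus 2 permutes, for a total cost of 4.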
1235InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
1236 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1237 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1238 bool UseMaskForCond, bool UseMaskForGaps) {
1239 if (UseMaskForCond || UseMaskForGaps)
1240 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1241 Alignment, AddressSpace, CostKind,
1242 UseMaskForCond, UseMaskForGaps);
1243 assert(isa<VectorType>(VecTy) &&
1244 "Expect a vector type for interleaved memory op");
1245
1246 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1247 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1248 unsigned VF = NumElts / Factor;
1249 unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
1250 unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
1251 unsigned NumPermutes = 0;
1252
1253 if (Opcode == Instruction::Load) {
1254 // Loading interleave groups may have gaps, which may mean fewer
1255 // loads. Find out how many vectors will be loaded in total, and in how
1256 // many of them each value will be in.
1257 BitVector UsedInsts(NumVectorMemOps, false);
1258 std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
1259 for (unsigned Index : Indices)
1260 for (unsigned Elt = 0; Elt < VF; ++Elt) {
1261 unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
1262 UsedInsts.set(Vec);
1263 ValueVecs[Index].set(Vec);
1264 }
1265 NumVectorMemOps = UsedInsts.count();
1266
1267 for (unsigned Index : Indices) {
1268 // Estimate that each loaded source vector containing this Index
1269 // requires one operation, except that vperm can handle two input
1270 // registers the first time for each dst vector.
1271 unsigned NumSrcVecs = ValueVecs[Index].count();
1272 unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
1273 assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
1274 NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
1275 }
1276 } else {
1277 // Estimate the permutes for each stored vector as the smaller of the
1278 // number of elements and the number of source vectors. Subtract one per
1279 // dst vector for vperm (S.A.).
1280 unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
1281 unsigned NumDstVecs = NumVectorMemOps;
1282 NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
1283 }
1284
1285 // Cost of load/store operations and the permutations needed.
1286 return NumVectorMemOps + NumPermutes;
1287}
1288
1289static int
1290getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1291 const SmallVectorImpl<Type *> &ParamTys) {
1292 if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1293 return getNumVectorRegs(RetTy); // VPERM
1294
1295 if (ID == Intrinsic::vector_reduce_add) {
1296 // Retrieve number and size of elements for the vector op.
1297 auto *VTy = cast<FixedVectorType>(ParamTys.front());
1298 unsigned NumElements = VTy->getNumElements();
1299 unsigned ScalarSize = VTy->getScalarSizeInBits();
1300 // For scalar sizes >128 bits, we fall back to the generic cost estimate.
1301 if (ScalarSize > SystemZ::VectorBits)
1302 return -1;
1303 // A single vector register can hold this many elements.
1304 unsigned MaxElemsPerVector = SystemZ::VectorBits / ScalarSize;
1305 // This many vector regs are needed to represent the input elements (V).
1306 unsigned VectorRegsNeeded = getNumVectorRegs(VTy);
1307 // This many instructions are needed for the final sum of vector elems (S).
1308 unsigned LastVectorHandling =
1309 2 * Log2_32_Ceil(std::min(NumElements, MaxElemsPerVector));
1310 // We use vector adds to create a sum vector, which takes
1311 // V/2 + V/4 + ... = V - 1 operations.
1312 // Then, we need S operations to sum up the elements of that sum vector,
1313 // for a total of V + S - 1 operations.
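// E.g. for <4 x i32>: V = 1 and S = 2 * log2(4) = 4, giving an estimate of 4.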
1314 int Cost = VectorRegsNeeded + LastVectorHandling - 1;
1315 return Cost;
1316 }
1317 return -1;
1318}
1319
1320InstructionCost
1321SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1322 TTI::TargetCostKind CostKind) {
1323 InstructionCost Cost = getVectorIntrinsicInstrCost(
1324 ICA.getID(), ICA.getReturnType(), ICA.getArgTypes());
1325 if (Cost != -1)
1326 return Cost;
1327 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1328}