//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a TargetTransformInfo analysis pass specific to the
// SystemZ target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "SystemZTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "systemztti"

//===----------------------------------------------------------------------===//
//
// SystemZ cost model.
//
//===----------------------------------------------------------------------===//

static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
  bool UsedAsMemCpySource = false;
  for (const User *U : V->users())
    if (const Instruction *User = dyn_cast<Instruction>(U)) {
      if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
        UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
        continue;
      }
      if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
        if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
          UsedAsMemCpySource = true;
          continue;
        }
      }
      OtherUse = true;
    }
  return UsedAsMemCpySource;
}
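
// Illustrative example (comment added for exposition, not in the original
// source): an argument that reaches a memcpy only through GEPs/bitcasts and
// is used as its source operand still counts, e.g.
//   define void @callee(ptr %dst, ptr %src) {
//     call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 32, i1 false)
//     ret void
//   }
// Here isUsedAsMemCpySource(%src) is true with OtherUse left false, so the
// function below grants %src the inlining bonus; %dst gets no bonus because
// it is the destination operand, which sets OtherUse.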

unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  unsigned Bonus = 0;

  // Increase the threshold if an incoming argument is used only as a memcpy
  // source.
  if (Function *Callee = CB->getCalledFunction())
    for (Argument &Arg : Callee->args()) {
      bool OtherUse = false;
      if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse)
        Bonus += 150;
    }

  LLVM_DEBUG(if (Bonus)
               dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
  return Bonus;
}

InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // There is no cost model implemented yet for operations on integers wider
  // than 64 bits.
  if (BitSize > 64)
    return TTI::TCC_Free;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    // Constants loaded via lgfi.
    if (isInt<32>(Imm.getSExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llilf.
    if (isUInt<32>(Imm.getZExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llihf:
    if ((Imm.getZExtValue() & 0xffffffff) == 0)
      return TTI::TCC_Basic;

    return 2 * TTI::TCC_Basic;
  }

  return 4 * TTI::TCC_Basic;
}
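
// Worked example (illustrative, not part of the original source): for i64,
// the constant 0x100000000 has its low 32 bits clear, so it loads with a
// single llihf and costs TCC_Basic, whereas 0x100000001 needs two
// instructions (e.g. llihf followed by oilf) and costs 2 * TCC_Basic.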

InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // There is no cost model implemented yet for operations on integers wider
  // than 64 bits.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    if (Idx == 0 && Imm.getBitWidth() <= 64) {
      // Any 8-bit immediate store can be implemented via mvi.
      if (BitSize == 8)
        return TTI::TCC_Free;
      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
      if (isInt<16>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::ICmp:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Comparisons against signed 32-bit immediates implemented via cgfi.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
      // Comparisons against unsigned 32-bit immediates implemented via clgfi.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Add:
  case Instruction::Sub:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Or their negation, by swapping addition vs. subtraction.
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Mul:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use msgfi to multiply by 32-bit signed immediates.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Or:
  case Instruction::Xor:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Masks supported by oilf/xilf.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Masks supported by oihf/xihf.
      if ((Imm.getZExtValue() & 0xffffffff) == 0)
        return TTI::TCC_Free;
    }
    break;
  case Instruction::And:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Any 32-bit AND operation can be implemented via nilf.
      if (BitSize <= 32)
        return TTI::TCC_Free;
      // 64-bit masks supported by nilf.
      if (isUInt<32>(~Imm.getZExtValue()))
        return TTI::TCC_Free;
      // 64-bit masks supported by nilh.
      if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
        return TTI::TCC_Free;
      // Some 64-bit AND operations can be implemented via risbg.
      const SystemZInstrInfo *TII = ST->getInstrInfo();
      unsigned Start, End;
      if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    // Always return TCC_Free for the shift value of a shift instruction.
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
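
// Illustrative examples (comments only, not from the original source): an
// i64 AND with 0xfffffffffffff000 is TCC_Free because the inverted mask
// 0xfff fits in 32 bits (nilf), and an i64 compare against 1000000 is
// TCC_Free because the immediate fits the signed 32-bit cgfi form.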

InstructionCost
SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // There is no cost model implemented yet for operations on integers wider
  // than 64 bits.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    // These get expanded to include a normal addition/subtraction.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // These get expanded to include a normal multiplication.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }

  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
  if (ST->hasPopulationCount() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}
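
// Note (added for exposition): the population-count facility corresponds to
// z196 and later, so e.g. an i64 ctpop is reported as fast hardware there
// and as a software fallback on older subtargets.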

void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP,
                                             OptimizationRemarkEmitter *ORE) {
  // Find out if L contains a call, and how many stores there are.
  bool HasCall = false;
  InstructionCost NumStores = 0;
  for (auto &BB : L->blocks())
    for (auto &I : *BB) {
      if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (isLoweredToCall(F))
            HasCall = true;
          if (F->getIntrinsicID() == Intrinsic::memcpy ||
              F->getIntrinsicID() == Intrinsic::memset)
            NumStores++;
        } else { // indirect call.
          HasCall = true;
        }
      }
      if (isa<StoreInst>(&I)) {
        Type *MemAccessTy = I.getOperand(0)->getType();
        NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, None, 0,
                                     TTI::TCK_RecipThroughput);
      }
    }

  // The z13 processor will run out of store tags if too many stores
  // are fed into it too quickly. Therefore make sure there are not
  // too many stores in the resulting unrolled loop.
  unsigned const NumStoresVal = *NumStores.getValue();
  unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);

  if (HasCall) {
    // Only allow full unrolling if loop has any calls.
    UP.FullUnrollMaxCount = Max;
    UP.MaxCount = 1;
    return;
  }

  UP.MaxCount = Max;
  if (UP.MaxCount <= 1)
    return;

  // Allow partial and runtime trip count unrolling.
  UP.Partial = UP.Runtime = true;

  UP.PartialThreshold = 75;
  UP.DefaultUnrollRuntimeCount = 4;

  // Allow expensive instructions in the pre-header of the loop.
  UP.AllowExpensiveTripCount = true;

  UP.Force = true;
}
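
// Worked example (illustrative): a loop body with three unit-cost stores and
// no calls gets Max = 12 / 3 = 4, so partial/runtime unrolling is capped at
// four iterations; with a call present, only full unrolling up to Max is
// allowed (MaxCount = 1).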

void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

bool SystemZTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                   const TargetTransformInfo::LSRCost &C2) {
  // SystemZ specific: check instruction count (first), and don't care about
  // ImmCost, since offsets are checked explicitly.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.SetupCost);
}

unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (!Vector)
    // Discount the stack pointer. Also leave out %r0, since it can't
    // be used in an address.
    return 14;
  if (ST->hasVector())
    return 32;
  return 0;
}

TypeSize
SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(64);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
                                              unsigned NumStridedMemAccesses,
                                              unsigned NumPrefetches,
                                              bool HasCall) const {
  // Don't prefetch a loop with many far apart accesses.
  if (NumPrefetches > 16)
    return UINT_MAX;

  // Emit prefetch instructions for smaller strides in cases where we think
  // the hardware prefetcher might not be able to keep up.
  if (NumStridedMemAccesses > 32 && !HasCall &&
      (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
    return 1;

  return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
}
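
// Worked example (illustrative): with 40 strided accesses out of 41 total
// and no call, (41 - 40) * 32 = 32 <= 40 holds, so prefetching is considered
// for any stride; otherwise the minimum stride is 2048 bytes (8192 with
// miscellaneous-extensions-3, i.e. z15).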

bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  EVT VT = TLI->getValueType(DL, DataType);
  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}

// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
  unsigned Size =
      (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
  assert(Size > 0 && "Element must have non-zero size.");
  return Size;
}

// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
// 3.
static unsigned getNumVectorRegs(Type *Ty) {
  auto *VTy = cast<FixedVectorType>(Ty);
  unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
  assert(WideBits > 0 && "Could not compute size of vector");
  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}

InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  // TODO: return a good value for BB-VECTORIZER that includes the
  // immediate loads, which we do not want to count for the loop
  // vectorizer, since they are hopefully hoisted out of the loop. This
  // would require a new parameter 'InLoop', but not sure if constant
  // args are common enough to motivate this.

  unsigned ScalarBits = Ty->getScalarSizeInBits();

  // There are three cases of division and remainder: Dividing with a register
  // needs a divide instruction. A divisor which is a power of two constant
  // can be implemented with a sequence of shifts. Any other constant needs a
  // multiply and shifts.
  const unsigned DivInstrCost = 20;
  const unsigned DivMulSeqCost = 10;
  const unsigned SDivPow2Cost = 4;

  bool SignedDivRem =
      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
  bool UnsignedDivRem =
      Opcode == Instruction::UDiv || Opcode == Instruction::URem;

  // Check for a constant divisor.
  bool DivRemConst = false;
  bool DivRemConstPow2 = false;
  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
    if (const Constant *C = dyn_cast<Constant>(Args[1])) {
      const ConstantInt *CVal =
          (C->getType()->isVectorTy()
               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
               : dyn_cast<const ConstantInt>(C));
      if (CVal && (CVal->getValue().isPowerOf2() ||
                   CVal->getValue().isNegatedPowerOf2()))
        DivRemConstPow2 = true;
      else
        DivRemConst = true;
    }
  }
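
  // Illustrative example (comment only): a udiv by the splat constant 8 sets
  // DivRemConstPow2 (a shift sequence), a udiv by 10 sets DivRemConst (a
  // multiply-and-shift sequence costed as DivMulSeqCost), and a udiv by a
  // non-constant value is costed as a full divide (DivInstrCost).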

  if (!Ty->isVectorTy()) {
    // These FP operations are supported with a dedicated instruction for
    // float, double and fp128 (base implementation assumes float generally
    // costs 2).
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
      return 1;

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem)
      return LIBCALL_COST;

    // Give discount for some combined logical operations if supported.
    if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
      if (Opcode == Instruction::Xor) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() &&
                (I->getOpcode() == Instruction::And ||
                 I->getOpcode() == Instruction::Or ||
                 I->getOpcode() == Instruction::Xor))
              return 0;
        }
      }
      else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
              return 0;
        }
      }
    }

    // Or requires one instruction, although it has custom handling for i64.
    if (Opcode == Instruction::Or)
      return 1;

    if (Opcode == Instruction::Xor && ScalarBits == 1) {
      if (ST->hasLoadStoreOnCond2())
        return 5; // 2 * (li 0; loc 1); xor
      return 7; // 2 * ipm sequences ; xor ; shift ; compare
    }

    if (DivRemConstPow2)
      return (SignedDivRem ? SDivPow2Cost : 1);
    if (DivRemConst)
      return DivMulSeqCost;
    if (SignedDivRem || UnsignedDivRem)
      return DivInstrCost;
  }
  else if (ST->hasVector()) {
    auto *VTy = cast<FixedVectorType>(Ty);
    unsigned VF = VTy->getNumElements();
    unsigned NumVectors = getNumVectorRegs(Ty);

    // These vector operations are custom handled, but are still supported
    // with one instruction per vector, regardless of element size.
    if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
        Opcode == Instruction::AShr) {
      return NumVectors;
    }

    if (DivRemConstPow2)
      return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
    if (DivRemConst) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args, Tys);
    }
    if ((SignedDivRem || UnsignedDivRem) && VF > 4)
      // Temporary hack: disable high vectorization factors with integer
      // division/remainder, which will get scalarized and handled with
      // GR128 registers. The mischeduler is not clever enough to avoid
      // spilling yet.
      return 1000;

    // These FP operations are supported with a single vector instruction for
    // double (base implementation assumes float generally costs 2). For
    // FP128, the scalar cost is 1, and there is no overhead since the values
    // are already in scalar registers.
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
      switch (ScalarBits) {
      case 32: {
        // The vector enhancements facility 1 provides v4f32 instructions.
        if (ST->hasVectorEnhancements1())
          return NumVectors;
        // Return the cost of multiple scalar invocations plus the cost of
        // inserting and extracting the values.
        InstructionCost ScalarCost =
            getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
        SmallVector<Type *> Tys(Args.size(), Ty);
        InstructionCost Cost =
            (VF * ScalarCost) + getScalarizationOverhead(VTy, Args, Tys);
        // FIXME: VF 2 for these FP operations is currently just as
        // expensive as for VF 4.
        if (VF == 2)
          Cost *= 2;
        return Cost;
      }
      case 64:
      case 128:
        return NumVectors;
      default:
        break;
      }
    }

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      InstructionCost Cost =
          (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args, Tys);
      // FIXME: VF 2 for float is currently just as expensive as for VF 4.
      if (VF == 2 && ScalarBits == 32)
        Cost *= 2;
      return Cost;
    }
  }

  // Fall back to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}

InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                               VectorType *Tp,
                                               ArrayRef<int> Mask,
                                               TTI::TargetCostKind CostKind,
                                               int Index, VectorType *SubTp,
                                               ArrayRef<const Value *> Args) {
  Kind = improveShuffleKindFromMask(Kind, Mask);
  if (ST->hasVector()) {
    unsigned NumVectors = getNumVectorRegs(Tp);

    // TODO: Since fp32 is expanded, the shuffle cost should always be 0.

    // FP128 values are always in scalar registers, so there is no work
    // involved with a shuffle, except for broadcast. In that case register
    // moves are done with a single instruction per element.
    if (Tp->getScalarType()->isFP128Ty())
      return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);

    switch (Kind) {
    case TargetTransformInfo::SK_ExtractSubvector:
      // ExtractSubvector Index indicates start offset.

      // Extracting a subvector from first index is a noop.
      return (Index == 0 ? 0 : NumVectors);

    case TargetTransformInfo::SK_Broadcast:
      // Loop vectorizer calls here to figure out the extra cost of
      // broadcasting a loaded value to all elements of a vector. Since vlrep
      // loads and replicates with a single instruction, adjust the returned
      // value.
      return NumVectors - 1;

    default:

      // SystemZ supports single instruction permutation / replication.
      return NumVectors;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

// Return the log2 difference of the element sizes of the two vector types.
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
  unsigned Bits0 = Ty0->getScalarSizeInBits();
  unsigned Bits1 = Ty1->getScalarSizeInBits();

  if (Bits1 > Bits0)
    return (Log2_32(Bits1) - Log2_32(Bits0));

  return (Log2_32(Bits0) - Log2_32(Bits1));
}
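
// For example (illustrative): between i64 and i8 elements the difference is
// log2(64) - log2(8) = 6 - 3 = 3, i.e. three halvings or doublings of the
// element size.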

// Return the number of instructions needed to truncate SrcTy to DstTy.
unsigned SystemZTTIImpl::
getVectorTruncCost(Type *SrcTy, Type *DstTy) {
  assert(SrcTy->isVectorTy() && DstTy->isVectorTy());
  assert(SrcTy->getPrimitiveSizeInBits().getFixedSize() >
             DstTy->getPrimitiveSizeInBits().getFixedSize() &&
         "Packing must reduce size of vector type.");
  assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
             cast<FixedVectorType>(DstTy)->getNumElements() &&
         "Packing should not change number of elements.");

  // TODO: Since fp32 is expanded, the extract cost should always be 0.

  unsigned NumParts = getNumVectorRegs(SrcTy);
  if (NumParts <= 2)
    // Up to 2 vector registers can be truncated efficiently with pack or
    // permute. The latter requires an immediate mask to be loaded, which
    // typically gets hoisted out of a loop. TODO: return a good value for
    // BB-VECTORIZER that includes the immediate loads, which we do not want
    // to count for the loop vectorizer.
    return 1;

  unsigned Cost = 0;
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
  for (unsigned P = 0; P < Log2Diff; ++P) {
    if (NumParts > 1)
      NumParts /= 2;
    Cost += NumParts;
  }

  // Currently, a general mix of permutes and pack instructions is output by
  // isel, which follow the cost computation above except for this case which
  // is one instruction less:
  if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
      DstTy->getScalarSizeInBits() == 8)
    Cost--;

  return Cost;
}
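
// Worked example (illustrative): truncating <8 x i64> to <8 x i8> starts
// from NumParts = 4 and Log2Diff = 3, giving 2 + 1 + 1 = 4 steps, minus one
// for the special case above, i.e. a cost of 3.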

// Return the cost of converting a vector bitmask produced by a compare
// (SrcTy), to the type of the select or extend instruction (DstTy).
unsigned SystemZTTIImpl::
getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
  assert(SrcTy->isVectorTy() && DstTy->isVectorTy() &&
         "Should only be called with vector types.");

  unsigned PackCost = 0;
  unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
  unsigned DstScalarBits = DstTy->getScalarSizeInBits();
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  if (SrcScalarBits > DstScalarBits)
    // The bitmask will be truncated.
    PackCost = getVectorTruncCost(SrcTy, DstTy);
  else if (SrcScalarBits < DstScalarBits) {
    unsigned DstNumParts = getNumVectorRegs(DstTy);
    // Each vector select needs its part of the bitmask unpacked.
    PackCost = Log2Diff * DstNumParts;
    // Extra cost for moving part of mask before unpacking.
    PackCost += DstNumParts - 1;
  }

  return PackCost;
}

// Return the type of the compared operands. This is needed to compute the
// cost for a Select / ZExt or SExt instruction.
static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
  Type *OpTy = nullptr;
  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
    OpTy = CI->getOperand(0)->getType();
  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
    if (LogicI->getNumOperands() == 2)
      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
        if (isa<CmpInst>(LogicI->getOperand(1)))
          OpTy = CI0->getOperand(0)->getType();

  if (OpTy != nullptr) {
    if (VF == 1) {
      assert(!OpTy->isVectorTy() && "Expected scalar type");
      return OpTy;
    }
    // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
    // be either scalar or already vectorized with a same or lesser VF.
    Type *ElTy = OpTy->getScalarType();
    return FixedVectorType::get(ElTy, VF);
  }

  return nullptr;
}

// Get the cost of converting a boolean vector to a vector with same width
// and element size as Dst, plus the cost of zero extending if needed.
unsigned SystemZTTIImpl::
getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
                              const Instruction *I) {
  auto *DstVTy = cast<FixedVectorType>(Dst);
  unsigned VF = DstVTy->getNumElements();
  unsigned Cost = 0;
  // If we know the widths of the compared operands, get the cost of
  // converting the bitmask to match Dst. Otherwise assume equal widths.
  Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
  if (CmpOpTy != nullptr)
    Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
  if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
    // One 'vn' per dst vector with an immediate mask.
    Cost += getNumVectorRegs(Dst);
  return Cost;
}

InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                 Type *Src,
                                                 TTI::CastContextHint CCH,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  // FIXME: Can the logic below also be used for these cost kinds?
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
    auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    return BaseCost == 0 ? BaseCost : 1;
  }

  unsigned DstScalarBits = Dst->getScalarSizeInBits();
  unsigned SrcScalarBits = Src->getScalarSizeInBits();

  if (!Src->isVectorTy()) {
    assert(!Dst->isVectorTy());

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
      if (SrcScalarBits >= 32 ||
          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
        return 1;
      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
    }

    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
        Src->isIntegerTy(1)) {
      if (ST->hasLoadStoreOnCond2())
        return 2; // li 0; loc 1

      // This should be extension of a compare i1 result, which is done with
      // ipm and a varying sequence of instructions.
      unsigned Cost = 0;
      if (Opcode == Instruction::SExt)
        Cost = (DstScalarBits < 64 ? 3 : 4);
      if (Opcode == Instruction::ZExt)
        Cost = 3;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
        // If operands of an fp-type were compared, this costs +1.
        Cost++;
      return Cost;
    }
  }
  else if (ST->hasVector()) {
    // Vector to scalar cast.
    auto *SrcVecTy = cast<FixedVectorType>(Src);
    auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
    if (!DstVecTy) {
      // TODO: tune vector-to-scalar cast.
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    }
    unsigned VF = SrcVecTy->getNumElements();
    unsigned NumDstVectors = getNumVectorRegs(Dst);
    unsigned NumSrcVectors = getNumVectorRegs(Src);

    if (Opcode == Instruction::Trunc) {
      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
        return 0; // Check for NOOP conversions.
      return getVectorTruncCost(Src, Dst);
    }

    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
      if (SrcScalarBits >= 8) {
        // ZExt will use either a single unpack or a vector permute.
        if (Opcode == Instruction::ZExt)
          return NumDstVectors;

        // SExt will be handled with one unpack per doubling of width.
        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);

        // For types that span multiple vector registers, some additional
        // instructions are used to set up the unpacking.
        unsigned NumSrcVectorOps =
            (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
                            : (NumDstVectors / 2));

        return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
      }
      else if (SrcScalarBits == 1)
        return getBoolVecToIntConversionCost(Opcode, Dst, I);
    }

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
        Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
      // TODO: Fix base implementation which could simplify things a bit here
      // (seems to miss on differentiating on scalar/vector types).

      // Only 64 bit vector conversions are natively supported before z15.
      if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
        if (SrcScalarBits == DstScalarBits)
          return NumDstVectors;

        if (SrcScalarBits == 1)
          return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
      }

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values. Base implementation does not
      // realize float->int gets scalarized.
      InstructionCost ScalarCost = getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
      InstructionCost TotCost = VF * ScalarCost;
      bool NeedsInserts = true, NeedsExtracts = true;
      // FP128 registers do not get inserted or extracted.
      if (DstScalarBits == 128 &&
          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
        NeedsInserts = false;
      if (SrcScalarBits == 128 &&
          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
        NeedsExtracts = false;

      TotCost += getScalarizationOverhead(SrcVecTy, false, NeedsExtracts);
      TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts, false);

      // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
      if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
        TotCost *= 2;

      return TotCost;
    }

    if (Opcode == Instruction::FPTrunc) {
      if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
        return VF /*ldxbr/lexbr*/ +
               getScalarizationOverhead(DstVecTy, true, false);
      else // double -> float
        return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
    }

    if (Opcode == Instruction::FPExt) {
      if (SrcScalarBits == 32 && DstScalarBits == 64) {
        // float -> double is very rare and currently unoptimized. Instead of
        // using vldeb, which can do two at a time, all conversions are
        // scalarized.
        return VF * 2;
      }
      // -> fp128. VF * lxdb/lxeb + extraction of elements.
      return VF + getScalarizationOverhead(SrcVecTy, false, true);
    }
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

// Scalar i8 / i16 operations will typically be made after first extending
// the operands to i32.
static unsigned getOperandsExtensionCost(const Instruction *I) {
  unsigned ExtCost = 0;
  for (Value *Op : I->operands())
    // A load of i8 or i16 sign/zero extends to i32.
    if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
      ExtCost++;

  return ExtCost;
}

InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                   Type *CondTy,
                                                   CmpInst::Predicate VecPred,
                                                   TTI::TargetCostKind CostKind,
                                                   const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);

  if (!ValTy->isVectorTy()) {
    switch (Opcode) {
    case Instruction::ICmp: {
      // A loaded value compared with 0 with multiple users becomes Load and
      // Test. The load is then not foldable, so return 0 cost for the ICmp.
      unsigned ScalarBits = ValTy->getScalarSizeInBits();
      if (I != nullptr && ScalarBits >= 32)
        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
                C->isZero())
              return 0;

      unsigned Cost = 1;
      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
      return Cost;
    }
    case Instruction::Select:
      if (ValTy->isFloatingPointTy())
        return 4; // No load on condition for FP - costs a conditional jump.
      return 1; // Load On Condition / Select Register.
    }
  }
  else if (ST->hasVector()) {
    unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();

    // Called with a compare instruction.
    if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
      unsigned PredicateExtraCost = 0;
      if (I != nullptr) {
        // Some predicates cost one or two extra instructions.
        switch (cast<CmpInst>(I)->getPredicate()) {
        case CmpInst::Predicate::ICMP_NE:
        case CmpInst::Predicate::ICMP_UGE:
        case CmpInst::Predicate::ICMP_ULE:
        case CmpInst::Predicate::ICMP_SGE:
        case CmpInst::Predicate::ICMP_SLE:
          PredicateExtraCost = 1;
          break;
        case CmpInst::Predicate::FCMP_ONE:
        case CmpInst::Predicate::FCMP_ORD:
        case CmpInst::Predicate::FCMP_UEQ:
        case CmpInst::Predicate::FCMP_UNO:
          PredicateExtraCost = 2;
          break;
        default:
          break;
        }
      }

      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
      // floats. FIXME: <2 x float> generates same code as <4 x float>.
      unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
      unsigned NumVecs_cmp = getNumVectorRegs(ValTy);

      unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
      return Cost;
    }
    else { // Called with a select instruction.
      assert(Opcode == Instruction::Select);

      // We can figure out the extra cost of packing / unpacking if the
      // instruction was passed and the compare instruction is found.
      unsigned PackCost = 0;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
      if (CmpOpTy != nullptr)
        PackCost =
          getVectorBitmaskConversionCost(CmpOpTy, ValTy);

      return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
    }
  }

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
}
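
// Illustrative example (comment only): an `icmp ne <2 x i64>` is costed
// above as 1 * (1 + 1) = 2 (one vector compare, e.g. vceqg, plus one extra
// instruction for the negated predicate), while `icmp eq <2 x i64>` costs 1.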

InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   unsigned Index) {
  // vlvgp will insert two grs into a vector register, so only count half the
  // number of instructions.
  if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
    return ((Index % 2 == 0) ? 1 : 0);

  if (Opcode == Instruction::ExtractElement) {
    int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);

    // Give a slight penalty for moving out of vector pipeline to FXU unit.
    if (Index == 0 && Val->isIntOrIntVectorTy())
      Cost += 1;

    return Cost;
  }

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}

// Check if a load may be folded as a memory operand in its user.
bool SystemZTTIImpl::
isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
  if (!Ld->hasOneUse())
    return false;
  FoldedValue = Ld;
  const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
  unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
  unsigned TruncBits = 0;
  unsigned SExtBits = 0;
  unsigned ZExtBits = 0;
  if (UserI->hasOneUse()) {
    unsigned UserBits = UserI->getType()->getScalarSizeInBits();
    if (isa<TruncInst>(UserI))
      TruncBits = UserBits;
    else if (isa<SExtInst>(UserI))
      SExtBits = UserBits;
    else if (isa<ZExtInst>(UserI))
      ZExtBits = UserBits;
  }
  if (TruncBits || SExtBits || ZExtBits) {
    FoldedValue = UserI;
    UserI = cast<Instruction>(*UserI->user_begin());
    // Load (single use) -> trunc/extend (single use) -> UserI
  }
  if ((UserI->getOpcode() == Instruction::Sub ||
       UserI->getOpcode() == Instruction::SDiv ||
       UserI->getOpcode() == Instruction::UDiv) &&
      UserI->getOperand(1) != FoldedValue)
    return false; // Not commutative, only RHS foldable.
  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
  // extension was made of the load.
  unsigned LoadOrTruncBits =
      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
  switch (UserI->getOpcode()) {
  case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
  case Instruction::Sub:
  case Instruction::ICmp:
    if (LoadedBits == 32 && ZExtBits == 64)
      return true;
    [[fallthrough]];
  case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
    if (UserI->getOpcode() != Instruction::ICmp) {
      if (LoadedBits == 16 &&
          (SExtBits == 32 ||
           (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
        return true;
      if (LoadOrTruncBits == 16)
        return true;
    }
    [[fallthrough]];
  case Instruction::SDiv: // SE: 32->64
    if (LoadedBits == 32 && SExtBits == 64)
      return true;
    [[fallthrough]];
  case Instruction::UDiv:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    // This also makes sense for float operations, but disabled for now due
    // to regressions.
    // case Instruction::FCmp:
    // case Instruction::FAdd:
    // case Instruction::FSub:
    // case Instruction::FMul:
    // case Instruction::FDiv:

    // All possible extensions of memory checked above.

    // Comparison between memory and immediate.
    if (UserI->getOpcode() == Instruction::ICmp)
      if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
        if (CI->getValue().isIntN(16))
          return true;
    return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
    break;
  }
  return false;
}
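
// Illustrative example (comment only): a single-use `load i32` that is
// sign-extended to i64 and consumed by a 64-bit add is foldable here (it can
// become e.g. an agf memory operand), so getMemoryOpCost() below may report
// the load itself as free.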

static bool isBswapIntrinsicCall(const Value *V) {
  if (const Instruction *I = dyn_cast<Instruction>(V))
    if (auto *CI = dyn_cast<CallInst>(I))
      if (auto *F = CI->getCalledFunction())
        if (F->getIntrinsicID() == Intrinsic::bswap)
          return true;
  return false;
}

InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                                MaybeAlign Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                TTI::OperandValueInfo OpInfo,
                                                const Instruction *I) {
  assert(!Src->isVoidTy() && "Invalid type");

  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
    // Store the load or its truncated or extended value in FoldedValue.
    const Instruction *FoldedValue = nullptr;
    if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
      const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
      assert(UserI->getNumOperands() == 2 && "Expected a binop.");

      // UserI can't fold two loads, so in that case return 0 cost only
      // half of the time.
      for (unsigned i = 0; i < 2; ++i) {
        if (UserI->getOperand(i) == FoldedValue)
          continue;

        if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))) {
          LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
          if (!OtherLoad &&
              (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
               isa<ZExtInst>(OtherOp)))
            OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
          if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
            return i == 0; // Both operands foldable.
        }
      }

      return 0; // Only I is foldable in user.
    }
  }

  unsigned NumOps =
      (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));

  // Store/Load reversed saves one instruction.
  if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
      I != nullptr) {
    if (Opcode == Instruction::Load && I->hasOneUse()) {
      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
      // In case of load -> bswap -> store, return normal cost for the load.
      if (isBswapIntrinsicCall(LdUser) &&
          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
        return 0;
    }
    else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
      const Value *StoredVal = SI->getValueOperand();
      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
        return 0;
    }
  }

  if (Src->getScalarSizeInBits() == 128)
    // 128 bit scalars are held in a pair of two 64 bit registers.
    NumOps *= 2;

  return NumOps;
}

// The generic implementation of getInterleavedMemoryOpCost() is based on
// adding costs of the memory operations plus all the extracts and inserts
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
  unsigned VF = NumElts / Factor;
  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
  unsigned NumPermutes = 0;

  if (Opcode == Instruction::Load) {
    // Loading interleave groups may have gaps, which may mean fewer
    // loads. Find out how many vectors will be loaded in total, and in how
    // many of them each value will be in.
    BitVector UsedInsts(NumVectorMemOps, false);
    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
    for (unsigned Index : Indices)
      for (unsigned Elt = 0; Elt < VF; ++Elt) {
        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
        UsedInsts.set(Vec);
        ValueVecs[Index].set(Vec);
      }
    NumVectorMemOps = UsedInsts.count();

    for (unsigned Index : Indices) {
      // Estimate that each loaded source vector containing this Index
      // requires one operation, except that vperm can handle two input
      // registers first time for each dst vector.
      unsigned NumSrcVecs = ValueVecs[Index].count();
      unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
      assert(NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
    }
  } else {
    // Estimate the permutes for each stored vector as the smaller of the
    // number of elements and the number of source vectors. Subtract one per
    // dst vector for vperm (S.A.).
    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
    unsigned NumDstVecs = NumVectorMemOps;
    assert(NumSrcVecs > 1 && "Expected at least two source vectors.");
    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
  }

  // Cost of load/store operations and the permutations needed.
  return NumVectorMemOps + NumPermutes;
}
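
// Worked example (illustrative): an interleaved load of <8 x i32> with
// Factor = 2 and both indices used occupies two vector registers
// (NumVectorMemOps = 2); each index draws from both source vectors and fits
// into one destination vector, adding one vperm per index, for a total cost
// of 2 + 2 = 4.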

static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
    return getNumVectorRegs(RetTy); // VPERM
  return -1;
}

InstructionCost
SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  InstructionCost Cost =
      getVectorIntrinsicInstrCost(ICA.getID(), ICA.getReturnType());
  if (Cost != -1)
    return Cost;
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
i
i
Definition: README.txt:29
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::TargetTransformInfo::PSK_FastHardware
@ PSK_FastHardware
Definition: TargetTransformInfo.h:583
llvm::Argument
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
llvm::BasicTTIImplBase< SystemZTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:37
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:471
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:217
llvm::TargetTransformInfo::LSRCost::NumRegs
unsigned NumRegs
Definition: TargetTransformInfo.h:404
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:439
llvm::SystemZSubtarget::hasLoadStoreOnCond2
bool hasLoadStoreOnCond2() const
Definition: SystemZSubtarget.h:148
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
llvm::SystemZTTIImpl::getBoolVecToIntConversionCost
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, const Instruction *I)
Definition: SystemZTargetTransformInfo.cpp:741
llvm::SystemZTTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
Definition: SystemZTargetTransformInfo.cpp:1107
llvm::EVT::isScalarInteger
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:149
llvm::InstructionCost::getValue
Optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
Definition: InstructionCost.h:87
getVectorIntrinsicInstrCost
static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy)
Definition: SystemZTargetTransformInfo.cpp:1234
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:719
PHI
Rewrite undef for PHI
Definition: AMDGPURewriteUndefForPHI.cpp:101
llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition: TargetTransformInfo.h:455
IntrinsicInst.h
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:444
llvm::Function
Definition: Function.h:60
llvm::SystemZSubtarget::hasVector
bool hasVector() const
Definition: SystemZSubtarget.h:214
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:546
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
llvm::BitVector::set
BitVector & set()
Definition: BitVector.h:344
llvm::TargetTransformInfo::PopcntSupportKind
PopcntSupportKind
Flags indicating the kind of support for population count.
Definition: TargetTransformInfo.h:583
llvm::APInt::isPowerOf2
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:425
C1
instcombine should handle this C2 when C1
Definition: README.txt:263
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:314
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:133
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1181
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:152
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
llvm::TargetTransformInfo::RGK_Scalar
@ RGK_Scalar
Definition: TargetTransformInfo.h:962
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:220
llvm::TargetTransformInfo::LSRCost::NumIVMuls
unsigned NumIVMuls
Definition: TargetTransformInfo.h:406
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:467
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:526
llvm::BasicTTIImplBase< SystemZTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:897
llvm::TargetTransformInfo::UnrollingPreferences::FullUnrollMaxCount
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
Definition: TargetTransformInfo.h:459
Vector
So we should use XX3Form_Rcr to implement intrinsic Convert DP outs ins xscvdpsp No builtin are required Round &Convert QP DP(dword[1] is set to zero) No builtin are required Round to Quad Precision because you need to assign rounding mode in instruction Provide builtin(set f128:$vT,(int_ppc_vsx_xsrqpi f128:$vB))(set f128 yields< n x< ty > >< result > yields< ty >< result > No builtin are required Load Store Vector
Definition: README_P9.txt:497
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
llvm::max
Expected< ExpressionValue > max(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:337
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:119
llvm::SystemZInstrInfo
Definition: SystemZInstrInfo.h:174
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:458
llvm::Type::isFloatingPointTy
bool isFloatingPointTy() const
Return true if this is one of the six floating-point types.
Definition: Type.h:168
llvm::TargetTransformInfo::OperandValueInfo
Definition: TargetTransformInfo.h:921
llvm::TargetTransformInfo::UnrollingPreferences::AllowExpensiveTripCount
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...
Definition: TargetTransformInfo.h:476
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::SystemZTTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(unsigned ClassID) const
Definition: SystemZTargetTransformInfo.cpp:354
llvm::TargetTransformInfo::LSRCost::Insns
unsigned Insns
TODO: Some of these could be merged.
Definition: TargetTransformInfo.h:403
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:885
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::TargetTransformInfo::LSRCost::AddRecCost
unsigned AddRecCost
Definition: TargetTransformInfo.h:405
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
getCmpOpsType
static Type * getCmpOpsType(const Instruction *I, unsigned VF=1)
Definition: SystemZTargetTransformInfo.cpp:714
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:187
llvm::TargetTransformInfo::LSRCost::SetupCost
unsigned SetupCost
Definition: TargetTransformInfo.h:409
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
TargetLowering.h
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::SystemZTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: SystemZTargetTransformInfo.cpp:998
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:164
llvm::SystemZSubtarget::hasMiscellaneousExtensions3
bool hasMiscellaneousExtensions3() const
Definition: SystemZSubtarget.h:244
getScalarSizeInBits
static unsigned getScalarSizeInBits(Type *Ty)
Definition: SystemZTargetTransformInfo.cpp:403
llvm::SystemZTTIImpl::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: SystemZTargetTransformInfo.cpp:757
llvm::BasicTTIImplBase< SystemZTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:1181
InlinePriorityMode::Cost
@ Cost
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:884
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1136
llvm::User
Definition: User.h:44
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:34
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::TargetTransformInfo::UnrollingPreferences::Force
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
Definition: TargetTransformInfo.h:479
llvm::BitVector::count
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:155
llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1396
llvm::SystemZTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=None)
Definition: SystemZTargetTransformInfo.cpp:593
llvm::M68kBeads::Bits1
@ Bits1
Definition: M68kBaseInfo.h:54
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:194
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:232
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:125
llvm::dwarf::Index
Index
Definition: Dwarf.h:472
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
llvm::Log2_32
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:547
llvm::BasicTTIImplBase< SystemZTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:819
llvm::Instruction
Definition: Instruction.h:42
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::SystemZTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: SystemZTargetTransformInfo.cpp:1241
llvm::BitVector
Definition: BitVector.h:75
isUsedAsMemCpySource
static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse)
Definition: SystemZTargetTransformInfo.cpp:33
llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition: TargetTransformInfo.h:962
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:684
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::MCID::Call
@ Call
Definition: MCInstrDesc.h:155
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
llvm::BasicTTIImplBase< SystemZTTIImpl >::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1131
llvm::SystemZTTIImpl::adjustInliningThreshold
unsigned adjustInliningThreshold(const CallBase *CB) const
Definition: SystemZTargetTransformInfo.cpp:52
llvm::SystemZTTIImpl::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: SystemZTargetTransformInfo.cpp:913
getNumVectorRegs
static unsigned getNumVectorRegs(Type *Ty)
Definition: SystemZTargetTransformInfo.cpp:413
llvm::BasicTTIImplBase< SystemZTTIImpl >::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:963
llvm::None
const NoneType None
Definition: None.h:24
llvm::LinearPolySize< TypeSize >::getFixed
static TypeSize getFixed(ScalarTy MinVal)
Definition: TypeSize.h:283
llvm::CallingConv::ID
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:119
llvm::TargetTransformInfo::PSK_Software
@ PSK_Software
Definition: TargetTransformInfo.h:583
llvm::SystemZSubtarget::hasVectorEnhancements2
bool hasVectorEnhancements2() const
Definition: SystemZSubtarget.h:253
llvm::CmpInst
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:709
llvm::SystemZTTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
Definition: SystemZTargetTransformInfo.cpp:366
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:196
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:298
llvm::divideCeil
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:683
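For example (an illustrative sketch, not code from this file):

#include "llvm/Support/MathExtras.h"

uint64_t NumParts = llvm::divideCeil(10, 4); // ceil(10 / 4) == 3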
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::SystemZTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: SystemZTargetTransformInfo.cpp:337
getOperandsExtensionCost
static unsigned getOperandsExtensionCost(const Instruction *I)
Definition: SystemZTargetTransformInfo.cpp:903
llvm::TargetTransformInfo::LSRCost
Definition: TargetTransformInfo.h:400
llvm::TargetLoweringBase::isTypeLegal
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
Definition: TargetLowering.h:936
llvm::SystemZTTIImpl::isFoldableLoad
bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue)
Definition: SystemZTargetTransformInfo.cpp:1020
llvm::SystemZTTIImpl::getVectorBitmaskConversionCost
unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy)
Definition: SystemZTargetTransformInfo.cpp:690
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:414
llvm::SystemZTTIImpl::getVectorTruncCost
unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy)
Definition: SystemZTargetTransformInfo.cpp:648
llvm::SystemZTTIImpl::isLSRCostLess
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
Definition: SystemZTargetTransformInfo.cpp:342
llvm::BasicTTIImplBase< SystemZTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=None)
Definition: BasicTTIImpl.h:933
SystemZTargetTransformInfo.h
isBswapIntrinsicCall
static bool isBswapIntrinsicCall(const Value *V)
Definition: SystemZTargetTransformInfo.cpp:1098
llvm::TargetTransformInfoImplBase::isLoweredToCall
bool isLoweredToCall(const Function *F) const
Definition: TargetTransformInfoImpl.h:121
getElSizeLog2Diff
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1)
Definition: SystemZTargetTransformInfo.cpp:636
llvm::SystemZSubtarget::hasVectorEnhancements1
bool hasVectorEnhancements1() const
Definition: SystemZSubtarget.h:233
llvm::TargetTransformInfo::LSRCost::ScaleCost
unsigned ScaleCost
Definition: TargetTransformInfo.h:410
llvm::BasicTTIImplBase< SystemZTTIImpl >::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: BasicTTIImpl.h:1285
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition: TargetTransformInfo.h:243
llvm::SystemZTTIImpl::getIntImmCost
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: SystemZTargetTransformInfo.cpp:69
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: APInt.h:32
llvm::SystemZTTIImpl::getIntImmCostIntrin
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: SystemZTargetTransformInfo.cpp:223
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::BasicTTIImplBase< SystemZTTIImpl >::getScalarizationOverhead
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:701
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:221
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
Definition: TargetTransformInfo.h:450
llvm::Type::isPtrOrPtrVectorTy
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:229
llvm::BasicTTIImplBase< SystemZTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:590
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174
llvm::SystemZTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: SystemZTargetTransformInfo.cpp:420
llvm::SystemZTTIImpl::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: SystemZTargetTransformInfo.cpp:1179
llvm::TargetTransformInfo::LSRCost::NumBaseAdds
unsigned NumBaseAdds
Definition: TargetTransformInfo.h:407
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
llvm::SystemZTTIImpl::getMinPrefetchStride
unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const override
Definition: SystemZTargetTransformInfo.cpp:379
llvm::SystemZSubtarget::hasPopulationCount
bool hasPopulationCount() const
Definition: SystemZSubtarget.h:157
llvm::SystemZSubtarget::hasMiscellaneousExtensions2
bool hasMiscellaneousExtensions2() const
Definition: SystemZSubtarget.h:217
CostTable.h
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:202
llvm::APInt::isNegatedPowerOf2
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:434
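A short sketch of the predicate, using an APInt constructed from a signed value:

#include "llvm/ADT/APInt.h"

llvm::APInt V(32, -8, /*isSigned=*/true); // 32-bit APInt holding -8
bool B = V.isNegatedPowerOf2();           // true: -(-8) == 8 is a power of two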
llvm::TypeSize
Definition: TypeSize.h:435
llvm::LinearPolySize< TypeSize >::getScalable
static TypeSize getScalable(ScalarTy MinVal)
Definition: TypeSize.h:286
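To contrast getScalable with getFixed (listed earlier), a hedged sketch:

#include "llvm/Support/TypeSize.h"

llvm::TypeSize F = llvm::TypeSize::getFixed(128);    // exactly 128 bits
llvm::TypeSize S = llvm::TypeSize::getScalable(128); // 128 bits per vscale multiple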
llvm::SystemZSubtarget::getInstrInfo
const SystemZInstrInfo * getInstrInfo() const override
Definition: SystemZSubtarget.h:113
llvm::TargetTransformInfo::RGK_ScalableVector
@ RGK_ScalableVector
Definition: TargetTransformInfo.h:962
llvm::MemCpyInst
This class wraps the llvm.memcpy intrinsic.
Definition: IntrinsicInst.h:1045
llvm::User::getNumOperands
unsigned getNumOperands() const
Definition: User.h:191
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:150
llvm::TargetTransformInfo::RegisterKind
RegisterKind
Definition: TargetTransformInfo.h:962
llvm::BasicTTIImplBase< SystemZTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1429
llvm::BasicTTIImplBase< SystemZTTIImpl >::getNumberOfParts
unsigned getNumberOfParts(Type *Tp)
Definition: BasicTTIImpl.h:2155
TargetTransformInfo.h
llvm::SystemZTTIImpl::getPopcntSupport
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
Definition: SystemZTargetTransformInfo.cpp:274
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst); holds everything related to calling a function.
Definition: InstrTypes.h:1174
llvm::SystemZTTIImpl::getIntImmCostInst
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: SystemZTargetTransformInfo.cpp:102
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition: TargetTransformInfo.h:244
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
BasicTTIImpl.h
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1490
llvm::SystemZTTIImpl::hasDivRemOp
bool hasDivRemOp(Type *DataType, bool IsSigned)
Definition: SystemZTargetTransformInfo.cpp:396
llvm::SystemZTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: SystemZTargetTransformInfo.cpp:281
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:218
llvm::Type::isFP128Ty
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:162
Debug.h
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:421
llvm::TargetTransformInfo::SK_ExtractSubvector
@ SK_ExtractSubvector
ExtractSubvector; the Index argument indicates the start offset.
Definition: TargetTransformInfo.h:892
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:164
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38