//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a TargetTransformInfo analysis pass specific to the
// SystemZ target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "SystemZTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "systemztti"

//===----------------------------------------------------------------------===//
//
// SystemZ cost model.
//
//===----------------------------------------------------------------------===//
static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
  bool UsedAsMemCpySource = false;
  for (const User *U : V->users())
    if (const Instruction *User = dyn_cast<Instruction>(U)) {
      if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
        UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
        continue;
      }
      if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
        if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
          UsedAsMemCpySource = true;
          continue;
        }
      }
      OtherUse = true;
    }
  return UsedAsMemCpySource;
}
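
// Illustrative example (not part of the upstream source): given IR such as
//   %gep = getelementptr i8, i8* %arg, i64 0
//   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %gep, i64 32, i1 false)
// the recursion walks through the GEP and finds %arg used only as the
// memcpy source operand, so it returns true and leaves OtherUse false.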

unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  unsigned Bonus = 0;

  // Increase the threshold if an incoming argument is used only as a memcpy
  // source.
  if (Function *Callee = CB->getCalledFunction())
    for (Argument &Arg : Callee->args()) {
      bool OtherUse = false;
      if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse)
        Bonus += 150;
    }

  LLVM_DEBUG(if (Bonus)
               dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
  return Bonus;
}
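
// For instance (illustrative): a callee with two pointer arguments that are
// each consumed only as memcpy sources receives a 2 * 150 = 300 point
// increase of its inlining threshold.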

InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // There is no cost model implemented yet for operations on integers wider
  // than 64 bits.
  if (BitSize > 64)
    return TTI::TCC_Free;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    // Constants loaded via lgfi.
    if (isInt<32>(Imm.getSExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llilf.
    if (isUInt<32>(Imm.getZExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llihf:
    if ((Imm.getZExtValue() & 0xffffffff) == 0)
      return TTI::TCC_Basic;

    return 2 * TTI::TCC_Basic;
  }

  return 4 * TTI::TCC_Basic;
}
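
// Worked example (illustrative): 0x100000000 has its low 32 bits clear, so
// llihf materializes it in one instruction (TCC_Basic), whereas 0x100000001
// fits none of the single-instruction patterns and costs 2 * TCC_Basic
// (e.g. llihf + oilf).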

InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // There is no cost model implemented yet for operations on integers wider
  // than 64 bits.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    if (Idx == 0 && Imm.getBitWidth() <= 64) {
      // Any 8-bit immediate store can be implemented via mvi.
      if (BitSize == 8)
        return TTI::TCC_Free;
      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
      if (isInt<16>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::ICmp:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Comparisons against signed 32-bit immediates are implemented via cgfi.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
      // Comparisons against unsigned 32-bit immediates are implemented via
      // clgfi.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Add:
  case Instruction::Sub:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Or their negation, by swapping addition vs. subtraction.
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Mul:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use msgfi to multiply by 32-bit signed immediates.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Or:
  case Instruction::Xor:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Masks supported by oilf/xilf.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Masks supported by oihf/xihf.
      if ((Imm.getZExtValue() & 0xffffffff) == 0)
        return TTI::TCC_Free;
    }
    break;
  case Instruction::And:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Any 32-bit AND operation can be implemented via nilf.
      if (BitSize <= 32)
        return TTI::TCC_Free;
      // 64-bit masks supported by nilf.
      if (isUInt<32>(~Imm.getZExtValue()))
        return TTI::TCC_Free;
      // 64-bit masks supported by nilh.
      if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
        return TTI::TCC_Free;
      // Some 64-bit AND operations can be implemented via risbg.
      const SystemZInstrInfo *TII = ST->getInstrInfo();
      unsigned Start, End;
      if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    // Always return TCC_Free for the shift value of a shift instruction.
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost
SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // There is no cost model implemented yet for operations on integers wider
  // than 64 bits.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    // These get expanded to include a normal addition/subtraction.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // These get expanded to include a normal multiplication.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
  if (ST->hasPopulationCount() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP,
                                             OptimizationRemarkEmitter *ORE) {
  // Find out if L contains a call, what the machine instruction count
  // estimate is, and how many stores there are.
  bool HasCall = false;
  InstructionCost NumStores = 0;
  for (auto &BB : L->blocks())
    for (auto &I : *BB) {
      if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (isLoweredToCall(F))
            HasCall = true;
          if (F->getIntrinsicID() == Intrinsic::memcpy ||
              F->getIntrinsicID() == Intrinsic::memset)
            NumStores++;
        } else { // indirect call.
          HasCall = true;
        }
      }
      if (isa<StoreInst>(&I)) {
        Type *MemAccessTy = I.getOperand(0)->getType();
        NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, None, 0,
                                     TTI::TCK_RecipThroughput);
      }
    }

  // The z13 processor will run out of store tags if too many stores
  // are fed into it too quickly. Therefore make sure there are not
  // too many stores in the resulting unrolled loop.
  unsigned const NumStoresVal = *NumStores.getValue();
  unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);

  if (HasCall) {
    // Only allow full unrolling if the loop has any calls.
    UP.FullUnrollMaxCount = Max;
    UP.MaxCount = 1;
    return;
  }

  UP.MaxCount = Max;
  if (UP.MaxCount <= 1)
    return;

  // Allow partial and runtime trip count unrolling.
  UP.Partial = UP.Runtime = true;

  UP.PartialThreshold = 75;
  UP.DefaultUnrollRuntimeCount = 4;

  // Allow expensive instructions in the pre-header of the loop.
  UP.AllowExpensiveTripCount = true;

  UP.Force = true;
}
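
// Example of the store-tag limit (illustrative): a loop body with three
// cheap stores gets Max = 12 / 3 = 4, so at most four iterations are
// unrolled; a loop without any stores keeps an unlimited unroll count.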

void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                                   TargetTransformInfo::LSRCost &C2) {
  // SystemZ specific: check instruction count (first), and don't care about
  // ImmCost, since offsets are checked explicitly.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.SetupCost);
}

unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (!Vector)
    // Discount the stack pointer. Also leave out %r0, since it can't
    // be used in an address.
    return 14;
  if (ST->hasVector())
    return 32;
  return 0;
}

TypeSize
SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(64);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
                                              unsigned NumStridedMemAccesses,
                                              unsigned NumPrefetches,
                                              bool HasCall) const {
  // Don't prefetch a loop with many far apart accesses.
  if (NumPrefetches > 16)
    return UINT_MAX;

  // Emit prefetch instructions for smaller strides in cases where we think
  // the hardware prefetcher might not be able to keep up.
  if (NumStridedMemAccesses > 32 && !HasCall &&
      (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
    return 1;

  return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
}
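
// Numeric illustration (not upstream): with 40 strided accesses out of 41
// total and no call, (41 - 40) * 32 = 32 <= 40 holds, so a minimum stride
// of 1 is returned and every stride gets prefetched; otherwise only strides
// of at least 2048 (8192 with miscellaneous-extensions-3) bytes do.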

bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  EVT VT = TLI->getValueType(DL, DataType);
  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}

// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
  unsigned Size =
    (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
  assert(Size > 0 && "Element must have non-zero size.");
  return Size;
}

// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
// 3.
static unsigned getNumVectorRegs(Type *Ty) {
  auto *VTy = cast<FixedVectorType>(Ty);
  unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
  assert(WideBits > 0 && "Could not compute size of vector");
  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}
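
// Worked example (illustrative): <6 x i64> occupies 6 * 64 = 384 bits,
// which is exactly 384 / 128 = 3 vector registers; <5 x i32> occupies
// 160 bits and rounds up to 160 / 128 + 1 = 2 registers.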

InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // TODO: return a good value for BB-VECTORIZER that includes the
  // immediate loads, which we do not want to count for the loop
  // vectorizer, since they are hopefully hoisted out of the loop. This
  // would require a new parameter 'InLoop', but not sure if constant
  // args are common enough to motivate this.

  unsigned ScalarBits = Ty->getScalarSizeInBits();

  // There are three cases of division and remainder: Dividing with a register
  // needs a divide instruction. A divisor which is a power of two constant
  // can be implemented with a sequence of shifts. Any other constant needs a
  // multiply and shifts.
  const unsigned DivInstrCost = 20;
  const unsigned DivMulSeqCost = 10;
  const unsigned SDivPow2Cost = 4;

  bool SignedDivRem =
      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
  bool UnsignedDivRem =
      Opcode == Instruction::UDiv || Opcode == Instruction::URem;

  // Check for a constant divisor.
  bool DivRemConst = false;
  bool DivRemConstPow2 = false;
  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
    if (const Constant *C = dyn_cast<Constant>(Args[1])) {
      const ConstantInt *CVal =
          (C->getType()->isVectorTy()
               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
               : dyn_cast<const ConstantInt>(C));
      if (CVal && (CVal->getValue().isPowerOf2() ||
                   CVal->getValue().isNegatedPowerOf2()))
        DivRemConstPow2 = true;
      else
        DivRemConst = true;
    }
  }

  if (!Ty->isVectorTy()) {
    // These FP operations are supported with a dedicated instruction for
    // float, double and fp128 (base implementation assumes float generally
    // costs 2).
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
      return 1;

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem)
      return LIBCALL_COST;

    // Give discount for some combined logical operations if supported.
    if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
      if (Opcode == Instruction::Xor) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() &&
                (I->getOpcode() == Instruction::And ||
                 I->getOpcode() == Instruction::Or ||
                 I->getOpcode() == Instruction::Xor))
              return 0;
        }
      }
      else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
              return 0;
        }
      }
    }

    // Or requires one instruction, although it has custom handling for i64.
    if (Opcode == Instruction::Or)
      return 1;

    if (Opcode == Instruction::Xor && ScalarBits == 1) {
      if (ST->hasLoadStoreOnCond2())
        return 5; // 2 * (li 0; loc 1); xor
      return 7; // 2 * ipm sequences ; xor ; shift ; compare
    }

    if (DivRemConstPow2)
      return (SignedDivRem ? SDivPow2Cost : 1);
    if (DivRemConst)
      return DivMulSeqCost;
    if (SignedDivRem || UnsignedDivRem)
      return DivInstrCost;
  }
  else if (ST->hasVector()) {
    auto *VTy = cast<FixedVectorType>(Ty);
    unsigned VF = VTy->getNumElements();
    unsigned NumVectors = getNumVectorRegs(Ty);

    // These vector operations are custom handled, but are still supported
    // with one instruction per vector, regardless of element size.
    if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
        Opcode == Instruction::AShr) {
      return NumVectors;
    }

    if (DivRemConstPow2)
      return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
    if (DivRemConst) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args, Tys);
    }
    if ((SignedDivRem || UnsignedDivRem) && VF > 4)
      // Temporary hack: disable high vectorization factors with integer
      // division/remainder, which will get scalarized and handled with
      // GR128 registers. The MI scheduler is not clever enough to avoid
      // spilling yet.
      return 1000;

    // These FP operations are supported with a single vector instruction for
    // double (base implementation assumes float generally costs 2). For
    // FP128, the scalar cost is 1, and there is no overhead since the values
    // are already in scalar registers.
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
      switch (ScalarBits) {
      case 32: {
        // The vector enhancements facility 1 provides v4f32 instructions.
        if (ST->hasVectorEnhancements1())
          return NumVectors;
        // Return the cost of multiple scalar invocations plus the cost of
        // inserting and extracting the values.
        InstructionCost ScalarCost =
            getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
        SmallVector<Type *> Tys(Args.size(), Ty);
        InstructionCost Cost =
            (VF * ScalarCost) + getScalarizationOverhead(VTy, Args, Tys);
        // FIXME: VF 2 for these FP operations is currently just as
        // expensive as for VF 4.
        if (VF == 2)
          Cost *= 2;
        return Cost;
      }
      case 64:
      case 128:
        return NumVectors;
      default:
        break;
      }
    }

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      InstructionCost Cost =
          (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args, Tys);
      // FIXME: VF 2 for float is currently just as expensive as for VF 4.
      if (VF == 2 && ScalarBits == 32)
        Cost *= 2;
      return Cost;
    }
  }

  // Fall back to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}
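
// Cost summary for scalar i64 division (illustrative): dividing by a
// register costs DivInstrCost (20), by a power-of-2 constant such as 8
// costs SDivPow2Cost (4, a shift sequence), and by any other constant such
// as 10 costs DivMulSeqCost (10, a multiply-and-shift sequence).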

InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                               VectorType *Tp,
                                               ArrayRef<int> Mask, int Index,
                                               VectorType *SubTp,
                                               ArrayRef<const Value *> Args) {
  Kind = improveShuffleKindFromMask(Kind, Mask);
  if (ST->hasVector()) {
    unsigned NumVectors = getNumVectorRegs(Tp);

    // TODO: Since fp32 is expanded, the shuffle cost should always be 0.

    // FP128 values are always in scalar registers, so there is no work
    // involved with a shuffle, except for broadcast. In that case register
    // moves are done with a single instruction per element.
    if (Tp->getScalarType()->isFP128Ty())
      return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);

    switch (Kind) {
    case TargetTransformInfo::SK_ExtractSubvector:
      // ExtractSubvector Index indicates start offset.

      // Extracting a subvector from first index is a noop.
      return (Index == 0 ? 0 : NumVectors);

    case TargetTransformInfo::SK_Broadcast:
      // Loop vectorizer calls here to figure out the extra cost of
      // broadcasting a loaded value to all elements of a vector. Since vlrep
      // loads and replicates with a single instruction, adjust the returned
      // value.
      return NumVectors - 1;

    default:

      // SystemZ supports single instruction permutation / replication.
      return NumVectors;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}

// Return the log2 difference of the element sizes of the two vector types.
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
  unsigned Bits0 = Ty0->getScalarSizeInBits();
  unsigned Bits1 = Ty1->getScalarSizeInBits();

  if (Bits1 > Bits0)
    return (Log2_32(Bits1) - Log2_32(Bits0));

  return (Log2_32(Bits0) - Log2_32(Bits1));
}
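
// Example (illustrative): between <8 x i64> and <8 x i8> the element sizes
// are 64 and 8 bits, so the difference is log2(64) - log2(8) = 6 - 3 = 3
// packing (or unpacking) steps.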

// Return the number of instructions needed to truncate SrcTy to DstTy.
unsigned SystemZTTIImpl::
getVectorTruncCost(Type *SrcTy, Type *DstTy) {
  assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
  assert(SrcTy->getPrimitiveSizeInBits().getFixedSize() >
             DstTy->getPrimitiveSizeInBits().getFixedSize() &&
         "Packing must reduce size of vector type.");
  assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
             cast<FixedVectorType>(DstTy)->getNumElements() &&
         "Packing should not change number of elements.");

  // TODO: Since fp32 is expanded, the extract cost should always be 0.

  unsigned NumParts = getNumVectorRegs(SrcTy);
  if (NumParts <= 2)
    // Up to 2 vector registers can be truncated efficiently with pack or
    // permute. The latter requires an immediate mask to be loaded, which
    // typically gets hoisted out of a loop. TODO: return a good value for
    // BB-VECTORIZER that includes the immediate loads, which we do not want
    // to count for the loop vectorizer.
    return 1;

  unsigned Cost = 0;
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
  for (unsigned P = 0; P < Log2Diff; ++P) {
    if (NumParts > 1)
      NumParts /= 2;
    Cost += NumParts;
  }

  // Currently, a general mix of permutes and pack instructions is output by
  // isel, which follows the cost computation above except for this case which
  // is one instruction less:
  if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
      DstTy->getScalarSizeInBits() == 8)
    Cost--;

  return Cost;
}
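
// Worked example (illustrative): truncating <8 x i64> to <8 x i8> starts
// with NumParts = 4 source registers and Log2Diff = 3, so the loop adds
// 2 + 1 + 1 = 4 packing steps, and the special case above subtracts one,
// giving a final cost of 3.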

// Return the cost of converting a vector bitmask produced by a compare
// (SrcTy), to the type of the select or extend instruction (DstTy).
unsigned SystemZTTIImpl::
getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
  assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
          "Should only be called with vector types.");

  unsigned PackCost = 0;
  unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
  unsigned DstScalarBits = DstTy->getScalarSizeInBits();
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  if (SrcScalarBits > DstScalarBits)
    // The bitmask will be truncated.
    PackCost = getVectorTruncCost(SrcTy, DstTy);
  else if (SrcScalarBits < DstScalarBits) {
    unsigned DstNumParts = getNumVectorRegs(DstTy);
    // Each vector select needs its part of the bitmask unpacked.
    PackCost = Log2Diff * DstNumParts;
    // Extra cost for moving part of mask before unpacking.
    PackCost += DstNumParts - 1;
  }

  return PackCost;
}
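
// Example (illustrative): a <4 x i32> compare feeding a <4 x i64> select
// has Log2Diff = 1 and DstNumParts = 2, so the bitmask costs 1 * 2 unpacks
// plus one extra move, i.e. PackCost = 3.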

// Return the type of the compared operands. This is needed to compute the
// cost for a Select / ZExt or SExt instruction.
static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
  Type *OpTy = nullptr;
  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
    OpTy = CI->getOperand(0)->getType();
  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
    if (LogicI->getNumOperands() == 2)
      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
        if (isa<CmpInst>(LogicI->getOperand(1)))
          OpTy = CI0->getOperand(0)->getType();

  if (OpTy != nullptr) {
    if (VF == 1) {
      assert (!OpTy->isVectorTy() && "Expected scalar type");
      return OpTy;
    }
    // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
    // be either scalar or already vectorized with a same or lesser VF.
    Type *ElTy = OpTy->getScalarType();
    return FixedVectorType::get(ElTy, VF);
  }

  return nullptr;
}

// Get the cost of converting a boolean vector to a vector with same width
// and element size as Dst, plus the cost of zero extending if needed.
unsigned SystemZTTIImpl::
getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
                              const Instruction *I) {
  auto *DstVTy = cast<FixedVectorType>(Dst);
  unsigned VF = DstVTy->getNumElements();
  unsigned Cost = 0;
  // If we know the widths of the compared operands, get the cost of
  // converting the bitmask to match Dst. Otherwise assume same widths.
  Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
  if (CmpOpTy != nullptr)
    Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
  if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
    // One 'vn' per dst vector with an immediate mask.
    Cost += getNumVectorRegs(Dst);
  return Cost;
}

InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                 Type *Src,
                                                 TTI::CastContextHint CCH,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  // FIXME: Can the logic below also be used for these cost kinds?
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
    auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    return BaseCost == 0 ? BaseCost : 1;
  }

  unsigned DstScalarBits = Dst->getScalarSizeInBits();
  unsigned SrcScalarBits = Src->getScalarSizeInBits();

  if (!Src->isVectorTy()) {
    assert (!Dst->isVectorTy());

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
      if (SrcScalarBits >= 32 ||
          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
        return 1;
      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
    }

    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
        Src->isIntegerTy(1)) {
      if (ST->hasLoadStoreOnCond2())
        return 2; // li 0; loc 1

      // This should be extension of a compare i1 result, which is done with
      // ipm and a varying sequence of instructions.
      unsigned Cost = 0;
      if (Opcode == Instruction::SExt)
        Cost = (DstScalarBits < 64 ? 3 : 4);
      if (Opcode == Instruction::ZExt)
        Cost = 3;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
        // If operands of an fp-type were compared, this costs +1.
        Cost++;
      return Cost;
    }
  }
  else if (ST->hasVector()) {
    // Vector to scalar cast.
    auto *SrcVecTy = cast<FixedVectorType>(Src);
    auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
    if (!DstVecTy) {
      // TODO: tune vector-to-scalar cast.
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    }
    unsigned VF = SrcVecTy->getNumElements();
    unsigned NumDstVectors = getNumVectorRegs(Dst);
    unsigned NumSrcVectors = getNumVectorRegs(Src);

    if (Opcode == Instruction::Trunc) {
      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
        return 0; // Check for NOOP conversions.
      return getVectorTruncCost(Src, Dst);
    }

    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
      if (SrcScalarBits >= 8) {
        // ZExt/SExt will be handled with one unpack per doubling of width.
        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);

        // For types that span multiple vector registers, some additional
        // instructions are used to set up the unpacking.
        unsigned NumSrcVectorOps =
          (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
                          : (NumDstVectors / 2));

        return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
      }
      else if (SrcScalarBits == 1)
        return getBoolVecToIntConversionCost(Opcode, Dst, I);
    }

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
        Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
      // TODO: Fix base implementation which could simplify things a bit here
      // (seems to miss on differentiating on scalar/vector types).

      // Only 64 bit vector conversions are natively supported before z15.
      if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
        if (SrcScalarBits == DstScalarBits)
          return NumDstVectors;

        if (SrcScalarBits == 1)
          return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
      }

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values. Base implementation does not
      // realize float->int gets scalarized.
      InstructionCost ScalarCost = getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
      InstructionCost TotCost = VF * ScalarCost;
      bool NeedsInserts = true, NeedsExtracts = true;
      // FP128 registers do not get inserted or extracted.
      if (DstScalarBits == 128 &&
          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
        NeedsInserts = false;
      if (SrcScalarBits == 128 &&
          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
        NeedsExtracts = false;

      TotCost += getScalarizationOverhead(SrcVecTy, false, NeedsExtracts);
      TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts, false);

      // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
      if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
        TotCost *= 2;

      return TotCost;
    }

    if (Opcode == Instruction::FPTrunc) {
      if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
        return VF /*ldxbr/lexbr*/ +
               getScalarizationOverhead(DstVecTy, true, false);
      else // double -> float
        return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
    }

    if (Opcode == Instruction::FPExt) {
      if (SrcScalarBits == 32 && DstScalarBits == 64) {
        // float -> double is very rare and currently unoptimized. Instead of
        // using vldeb, which can do two at a time, all conversions are
        // scalarized.
        return VF * 2;
      }
      // -> fp128. VF * lxdb/lxeb + extraction of elements.
      return VF + getScalarizationOverhead(SrcVecTy, false, true);
    }
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}
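
// Worked example for the unpack path (illustrative): sext <4 x i16> to
// <4 x i32> has NumUnpacks = 1 with one source and one destination
// register, so NumUnpacks * NumDstVectors + NumDstVectors / 2 = 1 + 0 = 1,
// matching a single unpack instruction.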

// Scalar i8 / i16 operations will typically be made after first extending
// the operands to i32.
static unsigned getOperandsExtensionCost(const Instruction *I) {
  unsigned ExtCost = 0;
  for (Value *Op : I->operands())
    // A load of i8 or i16 sign/zero extends to i32.
    if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
      ExtCost++;

  return ExtCost;
}
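
// Example (illustrative): for an i16 compare where one operand comes
// straight from a load (extended for free) and the other is a non-constant
// register value, only the latter needs an explicit extension, so
// ExtCost = 1.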

InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                   Type *CondTy,
                                                   CmpInst::Predicate VecPred,
                                                   TTI::TargetCostKind CostKind,
                                                   const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);

  if (!ValTy->isVectorTy()) {
    switch (Opcode) {
    case Instruction::ICmp: {
      // A loaded value compared with 0 with multiple users becomes Load and
      // Test. The load is then not foldable, so return 0 cost for the ICmp.
      unsigned ScalarBits = ValTy->getScalarSizeInBits();
      if (I != nullptr && ScalarBits >= 32)
        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
                C->isZero())
              return 0;

      unsigned Cost = 1;
      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
      return Cost;
    }
    case Instruction::Select:
      if (ValTy->isFloatingPointTy())
        return 4; // No load on condition for FP - costs a conditional jump.
      return 1; // Load On Condition / Select Register.
    }
  }
  else if (ST->hasVector()) {
    unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();

    // Called with a compare instruction.
    if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
      unsigned PredicateExtraCost = 0;
      if (I != nullptr) {
        // Some predicates cost one or two extra instructions.
        switch (cast<CmpInst>(I)->getPredicate()) {
        case CmpInst::Predicate::ICMP_NE:
        case CmpInst::Predicate::ICMP_UGE:
        case CmpInst::Predicate::ICMP_ULE:
        case CmpInst::Predicate::ICMP_SGE:
        case CmpInst::Predicate::ICMP_SLE:
          PredicateExtraCost = 1;
          break;
        case CmpInst::Predicate::FCMP_ONE:
        case CmpInst::Predicate::FCMP_ORD:
        case CmpInst::Predicate::FCMP_UEQ:
        case CmpInst::Predicate::FCMP_UNO:
          PredicateExtraCost = 2;
          break;
        default:
          break;
        }
      }

      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
      // floats. FIXME: <2 x float> generates same code as <4 x float>.
      unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
      unsigned NumVecs_cmp = getNumVectorRegs(ValTy);

      unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
      return Cost;
    }
    else { // Called with a select instruction.
      assert (Opcode == Instruction::Select);

      // We can figure out the extra cost of packing / unpacking if the
      // instruction was passed and the compare instruction is found.
      unsigned PackCost = 0;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
      if (CmpOpTy != nullptr)
        PackCost =
          getVectorBitmaskConversionCost(CmpOpTy, ValTy);

      return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
    }
  }

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
}

InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   unsigned Index) {
  // vlvgp will insert two grs into a vector register, so only count half the
  // number of instructions.
  if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
    return ((Index % 2 == 0) ? 1 : 0);

  if (Opcode == Instruction::ExtractElement) {
    int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);

    // Give a slight penalty for moving out of vector pipeline to FXU unit.
    if (Index == 0 && Val->isIntOrIntVectorTy())
      Cost += 1;

    return Cost;
  }

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}

// Check if a load may be folded as a memory operand in its user.
bool SystemZTTIImpl::
isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
  if (!Ld->hasOneUse())
    return false;
  FoldedValue = Ld;
  const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
  unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
  unsigned TruncBits = 0;
  unsigned SExtBits = 0;
  unsigned ZExtBits = 0;
  if (UserI->hasOneUse()) {
    unsigned UserBits = UserI->getType()->getScalarSizeInBits();
    if (isa<TruncInst>(UserI))
      TruncBits = UserBits;
    else if (isa<SExtInst>(UserI))
      SExtBits = UserBits;
    else if (isa<ZExtInst>(UserI))
      ZExtBits = UserBits;
  }
  if (TruncBits || SExtBits || ZExtBits) {
    FoldedValue = UserI;
    UserI = cast<Instruction>(*UserI->user_begin());
    // Load (single use) -> trunc/extend (single use) -> UserI
  }
  if ((UserI->getOpcode() == Instruction::Sub ||
       UserI->getOpcode() == Instruction::SDiv ||
       UserI->getOpcode() == Instruction::UDiv) &&
      UserI->getOperand(1) != FoldedValue)
    return false; // Not commutative, only RHS foldable.
  // LoadOrTruncBits holds the number of effectively loaded bits, but is 0 if
  // the load was extended.
  unsigned LoadOrTruncBits =
      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
  switch (UserI->getOpcode()) {
  case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
  case Instruction::Sub:
  case Instruction::ICmp:
    if (LoadedBits == 32 && ZExtBits == 64)
      return true;
    LLVM_FALLTHROUGH;
  case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
    if (UserI->getOpcode() != Instruction::ICmp) {
      if (LoadedBits == 16 &&
          (SExtBits == 32 ||
           (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
        return true;
      if (LoadOrTruncBits == 16)
        return true;
    }
    LLVM_FALLTHROUGH;
  case Instruction::SDiv: // SE: 32->64
    if (LoadedBits == 32 && SExtBits == 64)
      return true;
    LLVM_FALLTHROUGH;
  case Instruction::UDiv:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    // This also makes sense for float operations, but disabled for now due
    // to regressions.
    // case Instruction::FCmp:
    // case Instruction::FAdd:
    // case Instruction::FSub:
    // case Instruction::FMul:
    // case Instruction::FDiv:

    // All possible extensions of memory checked above.

    // Comparison between memory and immediate.
    if (UserI->getOpcode() == Instruction::ICmp)
      if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
        if (CI->getValue().isIntN(16))
          return true;
    return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
    break;
  }
  return false;
}
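
// Illustrative IR (not from the upstream source):
//   %v = load i32, i32* %p        ; single use
//   %s = sext i32 %v to i64       ; single use
//   %r = sub i64 %a, %s
// The load sits in the foldable RHS position and matches the SE: 32->64
// case, so it can be folded into a register-memory subtract and
// getMemoryOpCost() may report it as free.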

static bool isBswapIntrinsicCall(const Value *V) {
  if (const Instruction *I = dyn_cast<Instruction>(V))
    if (auto *CI = dyn_cast<CallInst>(I))
      if (auto *F = CI->getCalledFunction())
        if (F->getIntrinsicID() == Intrinsic::bswap)
          return true;
  return false;
}

InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                                MaybeAlign Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                const Instruction *I) {
  assert(!Src->isVoidTy() && "Invalid type");

  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
    // Store the load or its truncated or extended value in FoldedValue.
    const Instruction *FoldedValue = nullptr;
    if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
      const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
      assert (UserI->getNumOperands() == 2 && "Expected a binop.");

      // UserI can't fold two loads, so in that case return 0 cost only
      // half of the time.
      for (unsigned i = 0; i < 2; ++i) {
        if (UserI->getOperand(i) == FoldedValue)
          continue;

        if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))) {
          LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
          if (!OtherLoad &&
              (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
               isa<ZExtInst>(OtherOp)))
            OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
          if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
            return i == 0; // Both operands foldable.
        }
      }

      return 0; // Only I is foldable in user.
    }
  }

  unsigned NumOps =
    (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));

  // Store/Load reversed saves one instruction.
  if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
      I != nullptr) {
    if (Opcode == Instruction::Load && I->hasOneUse()) {
      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
      // In case of load -> bswap -> store, return normal cost for the load.
      if (isBswapIntrinsicCall(LdUser) &&
          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
        return 0;
    }
    else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
      const Value *StoredVal = SI->getValueOperand();
      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
        return 0;
    }
  }

  if (Src->getScalarSizeInBits() == 128)
    // 128 bit scalars are held in a pair of two 64 bit registers.
    NumOps *= 2;

  return NumOps;
}

// The generic implementation of getInterleavedMemoryOpCost() is based on
// adding costs of the memory operations plus all the extracts and inserts
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
  unsigned VF = NumElts / Factor;
  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
  unsigned NumPermutes = 0;

  if (Opcode == Instruction::Load) {
    // Loading interleave groups may have gaps, which may mean fewer
    // loads. Find out how many vectors will be loaded in total, and in how
    // many of them each value will be in.
    BitVector UsedInsts(NumVectorMemOps, false);
    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
    for (unsigned Index : Indices)
      for (unsigned Elt = 0; Elt < VF; ++Elt) {
        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
        UsedInsts.set(Vec);
        ValueVecs[Index].set(Vec);
      }
    NumVectorMemOps = UsedInsts.count();

    for (unsigned Index : Indices) {
      // Estimate that each loaded source vector containing this Index
      // requires one operation, except that vperm can handle two input
      // registers first time for each dst vector.
      unsigned NumSrcVecs = ValueVecs[Index].count();
      unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
      assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
    }
  } else {
    // Estimate the permutes for each stored vector as the smaller of the
    // number of elements and the number of source vectors. Subtract one per
    // dst vector for vperm (S.A.).
    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
    unsigned NumDstVecs = NumVectorMemOps;
    assert (NumSrcVecs > 1 && "Expected at least two source vectors.");
    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
  }

  // Cost of load/store operations and the permutations needed.
  return NumVectorMemOps + NumPermutes;
}
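
// Worked example (illustrative): an interleaved load of <8 x i32> with
// Factor = 2 and both indices used (VF = 4) performs NumVectorMemOps = 2
// vector loads; each index gathers from two source vectors into one
// destination vector at the cost of one vperm, so the total is 2 + 2 = 4.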

static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
    return getNumVectorRegs(RetTy); // VPERM
  return -1;
}

InstructionCost
SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  InstructionCost Cost =
      getVectorIntrinsicInstrCost(ICA.getID(), ICA.getReturnType());
  if (Cost != -1)
    return Cost;
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
i
i
Definition: README.txt:29
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::TargetTransformInfo::PSK_FastHardware
@ PSK_FastHardware
Definition: TargetTransformInfo.h:594
llvm::Argument
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
llvm::BasicTTIImplBase< SystemZTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:37
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:487
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:210
llvm::TargetTransformInfo::LSRCost::NumRegs
unsigned NumRegs
Definition: TargetTransformInfo.h:420
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:455
llvm::SystemZSubtarget::hasLoadStoreOnCond2
bool hasLoadStoreOnCond2() const
Definition: SystemZSubtarget.h:146
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
llvm::SystemZTTIImpl::getBoolVecToIntConversionCost
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, const Instruction *I)
Definition: SystemZTargetTransformInfo.cpp:742
llvm::EVT::isScalarInteger
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:149
llvm::InstructionCost::getValue
Optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
Definition: InstructionCost.h:87
getVectorIntrinsicInstrCost
static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy)
Definition: SystemZTargetTransformInfo.cpp:1230
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:719
llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition: TargetTransformInfo.h:471
IntrinsicInst.h
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:430
llvm::Function
Definition: Function.h:60
llvm::SystemZSubtarget::hasVector
bool hasVector() const
Definition: SystemZSubtarget.h:212
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
llvm::BitVector::set
BitVector & set()
Definition: BitVector.h:344
llvm::TargetTransformInfo::PopcntSupportKind
PopcntSupportKind
Flags indicating the kind of support for population count.
Definition: TargetTransformInfo.h:594
llvm::APInt::isPowerOf2
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:425
C1
instcombine should handle this C2 when C1
Definition: README.txt:263
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:309
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:133
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1490
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:149
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
llvm::TargetTransformInfo::RGK_Scalar
@ RGK_Scalar
Definition: TargetTransformInfo.h:919
llvm::BasicTTIImplBase< SystemZTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:777
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:213
llvm::TargetTransformInfo::LSRCost::NumIVMuls
unsigned NumIVMuls
Definition: TargetTransformInfo.h:422
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:483
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1423
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:537
llvm::BasicTTIImplBase< SystemZTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:859
llvm::TargetTransformInfo::UnrollingPreferences::FullUnrollMaxCount
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
Definition: TargetTransformInfo.h:475
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
Vector
So we should use XX3Form_Rcr to implement intrinsic Convert DP outs ins xscvdpsp No builtin are required Round &Convert QP DP(dword[1] is set to zero) No builtin are required Round to Quad Precision because you need to assign rounding mode in instruction Provide builtin(set f128:$vT,(int_ppc_vsx_xsrqpi f128:$vB))(set f128 yields< n x< ty > >< result > yields< ty >< result > No builtin are required Load Store Vector
Definition: README_P9.txt:497
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
llvm::SystemZTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=None)
Definition: SystemZTargetTransformInfo.cpp:595
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:119
llvm::SystemZInstrInfo
Definition: SystemZInstrInfo.h:175
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
llvm::Type::isFloatingPointTy
bool isFloatingPointTy() const
Return true if this is one of the six floating-point types.
Definition: Type.h:163
llvm::TargetTransformInfo::UnrollingPreferences::AllowExpensiveTripCount
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...
Definition: TargetTransformInfo.h:492
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::SystemZTTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(unsigned ClassID) const
Definition: SystemZTargetTransformInfo.cpp:354
llvm::TargetTransformInfo::LSRCost::Insns
unsigned Insns
TODO: Some of these could be merged.
Definition: TargetTransformInfo.h:419
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:872
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::TargetTransformInfo::LSRCost::AddRecCost
unsigned AddRecCost
Definition: TargetTransformInfo.h:421
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
getCmpOpsType
static Type * getCmpOpsType(const Instruction *I, unsigned VF=1)
Definition: SystemZTargetTransformInfo.cpp:715
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:186
llvm::TargetTransformInfo::LSRCost::SetupCost
unsigned SetupCost
Definition: TargetTransformInfo.h:425
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
TargetLowering.h
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::SystemZTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: SystemZTargetTransformInfo.cpp:995
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:157
llvm::SystemZSubtarget::hasMiscellaneousExtensions3
bool hasMiscellaneousExtensions3() const
Definition: SystemZSubtarget.h:242
getScalarSizeInBits
static unsigned getScalarSizeInBits(Type *Ty)
Definition: SystemZTargetTransformInfo.cpp:403
llvm::SystemZTTIImpl::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: SystemZTargetTransformInfo.cpp:758
llvm::BasicTTIImplBase< SystemZTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:1139
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:871
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1091
llvm::User
Definition: User.h:44
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:34
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::TargetTransformInfo::UnrollingPreferences::Force
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
Definition: TargetTransformInfo.h:495
llvm::BitVector::count
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:155
llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1396
llvm::SystemZTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: SystemZTargetTransformInfo.cpp:420
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:227
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:127
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:109
llvm::Log2_32
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:623
llvm::Instruction
Definition: Instruction.h:42
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1478
llvm::SystemZTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: SystemZTargetTransformInfo.cpp:1237
llvm::BitVector
Definition: BitVector.h:75
isUsedAsMemCpySource
static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse)
Definition: SystemZTargetTransformInfo.cpp:33
llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition: TargetTransformInfo.h:919
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:684
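As a sketch of how such vector types are created (assuming LLVM's IR headers and libraries are available; the <4 x i32> choice here is arbitrary):

// Sketch only: building a fixed-width vector type.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;
  auto *VecTy =
      llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
  assert(VecTy->getNumElements() == 4);
  assert(VecTy->getScalarSizeInBits() == 32);
  return 0;
}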
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
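A minimal sketch contrasting Align and MaybeAlign, under the same LLVM-headers assumption:

// Sketch only: Align must be a non-zero power of two; MaybeAlign
// additionally admits an "unknown" state.
#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  llvm::Align A(16);
  assert(A.value() == 16);
  llvm::MaybeAlign MA;                  // default: alignment unknown
  assert(!MA);
  MA = llvm::Align(8);
  assert(MA.valueOrOne().value() == 8);
  return 0;
}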
llvm::BasicTTIImplBase< SystemZTTIImpl >::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1091
llvm::SystemZTTIImpl::adjustInliningThreshold
unsigned adjustInliningThreshold(const CallBase *CB) const
Definition: SystemZTargetTransformInfo.cpp:52
llvm::SystemZTTIImpl::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: SystemZTargetTransformInfo.cpp:910
getNumVectorRegs
static unsigned getNumVectorRegs(Type *Ty)
Definition: SystemZTargetTransformInfo.cpp:413
llvm::BasicTTIImplBase< SystemZTTIImpl >::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:921
llvm::None
const NoneType None
Definition: None.h:24
llvm::LinearPolySize< TypeSize >::getFixed
static TypeSize getFixed(ScalarTy MinVal)
Definition: TypeSize.h:283
llvm::CallingConv::ID
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:117
llvm::TargetTransformInfo::PSK_Software
@ PSK_Software
Definition: TargetTransformInfo.h:594
llvm::SystemZSubtarget::hasVectorEnhancements2
bool hasVectorEnhancements2() const
Definition: SystemZSubtarget.h:251
llvm::CmpInst
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:709
llvm::SystemZTTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
Definition: SystemZTargetTransformInfo.cpp:366
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:191
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:305
llvm::divideCeil
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:769
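A short example of divideCeil's rounding behavior:

// Sketch only: integer division rounded up.
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::divideCeil(10, 4) == 3); // ceil(10/4)
  assert(llvm::divideCeil(8, 4) == 2);  // exact division
  return 0;
}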
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::isInt< 32 >
constexpr bool isInt< 32 >(int64_t x)
Definition: MathExtras.h:373
llvm::SystemZTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: SystemZTargetTransformInfo.cpp:337
getOperandsExtensionCost
static unsigned getOperandsExtensionCost(const Instruction *I)
Definition: SystemZTargetTransformInfo.cpp:900
llvm::TargetTransformInfo::LSRCost
Definition: TargetTransformInfo.h:416
llvm::TargetLoweringBase::isTypeLegal
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
Definition: TargetLowering.h:932
llvm::SystemZTTIImpl::isFoldableLoad
bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue)
Definition: SystemZTargetTransformInfo.cpp:1017
llvm::SystemZTTIImpl::getVectorBitmaskConversionCost
unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy)
Definition: SystemZTargetTransformInfo.cpp:691
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:430
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:898
llvm::SystemZTTIImpl::getVectorTruncCost
unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy)
Definition: SystemZTargetTransformInfo.cpp:649
llvm::isUInt< 32 >
constexpr bool isUInt< 32 >(uint64_t x)
Definition: MathExtras.h:411
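A hedged sketch of the isInt<N>/isUInt<N> range checks documented above:

// Sketch only: does a 64-bit value fit in a signed/unsigned 32-bit field?
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::isInt<32>(-2147483648LL));  // INT32_MIN fits
  assert(!llvm::isInt<32>(2147483648LL));  // one past INT32_MAX
  assert(llvm::isUInt<32>(4294967295ULL)); // UINT32_MAX fits
  assert(!llvm::isUInt<32>(4294967296ULL));
  return 0;
}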
SystemZTargetTransformInfo.h
llvm::SystemZTTIImpl::isLSRCostLess
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2)
Definition: SystemZTargetTransformInfo.cpp:342
isBswapIntrinsicCall
static bool isBswapIntrinsicCall(const Value *V)
Definition: SystemZTargetTransformInfo.cpp:1095
llvm::TargetTransformInfoImplBase::isLoweredToCall
bool isLoweredToCall(const Function *F) const
Definition: TargetTransformInfoImpl.h:121
getElSizeLog2Diff
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1)
Definition: SystemZTargetTransformInfo.cpp:637
llvm::SystemZSubtarget::hasVectorEnhancements1
bool hasVectorEnhancements1() const
Definition: SystemZSubtarget.h:231
llvm::TargetTransformInfo::LSRCost::ScaleCost
unsigned ScaleCost
Definition: TargetTransformInfo.h:426
llvm::BasicTTIImplBase< SystemZTTIImpl >::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: BasicTTIImpl.h:1240
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:890
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition: TargetTransformInfo.h:261
llvm::SystemZTTIImpl::getIntImmCost
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: SystemZTargetTransformInfo.cpp:69
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::SystemZTTIImpl::getIntImmCostIntrin
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: SystemZTargetTransformInfo.cpp:223
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::BasicTTIImplBase< SystemZTTIImpl >::getScalarizationOverhead
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:698
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:214
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
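A minimal sketch of the usual llvm_unreachable pattern (the mnemonic helper and its opcode values are hypothetical, purely for illustration):

// Sketch only: llvm_unreachable marks the impossible fall-through
// path after a switch that covers every legal case.
#include "llvm/Support/ErrorHandling.h"

static const char *mnemonic(int Opcode) {
  switch (Opcode) {
  case 0: return "add";
  case 1: return "sub";
  }
  llvm_unreachable("unexpected opcode");
}

int main() { return mnemonic(0) != nullptr ? 0 : 1; }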
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with a run-time trip count.
Definition: TargetTransformInfo.h:466
llvm::Type::isPtrOrPtrVectorTy
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:224
llvm::BasicTTIImplBase< SystemZTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:585
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:280
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:176
llvm::isInt< 16 >
constexpr bool isInt< 16 >(int64_t x)
Definition: MathExtras.h:370
llvm::SystemZTTIImpl::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: SystemZTargetTransformInfo.cpp:1175
llvm::TargetTransformInfo::LSRCost::NumBaseAdds
unsigned NumBaseAdds
Definition: TargetTransformInfo.h:423
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:148
llvm::SystemZTTIImpl::getMinPrefetchStride
unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const override
Definition: SystemZTargetTransformInfo.cpp:379
llvm::SystemZSubtarget::hasPopulationCount
bool hasPopulationCount() const
Definition: SystemZSubtarget.h:155
llvm::SystemZSubtarget::hasMiscellaneousExtensions2
bool hasMiscellaneousExtensions2() const
Definition: SystemZSubtarget.h:215
CostTable.h
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:197
llvm::APInt::isNegatedPowerOf2
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:434
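A small example of isNegatedPowerOf2 on signed APInt values:

// Sketch only: -8 negates to 8 (a power of two); -6 does not.
#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  llvm::APInt A(32, -8, /*isSigned=*/true);
  assert(A.isNegatedPowerOf2());
  llvm::APInt B(32, -6, /*isSigned=*/true);
  assert(!B.isNegatedPowerOf2());
  return 0;
}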
llvm::TypeSize
Definition: TypeSize.h:421
llvm::LinearPolySize< TypeSize >::getScalable
static TypeSize getScalable(ScalarTy MinVal)
Definition: TypeSize.h:286
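A brief sketch of fixed versus scalable TypeSize values:

// Sketch only: a scalable size is a known minimum times an unknown
// runtime factor; a fixed size is exact.
#include "llvm/Support/TypeSize.h"
#include <cassert>

int main() {
  llvm::TypeSize Fixed = llvm::TypeSize::getFixed(128);
  assert(!Fixed.isScalable() && Fixed.getKnownMinValue() == 128);
  llvm::TypeSize Scalable = llvm::TypeSize::getScalable(128);
  assert(Scalable.isScalable());
  return 0;
}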
llvm::SystemZSubtarget::getInstrInfo
const SystemZInstrInfo * getInstrInfo() const override
Definition: SystemZSubtarget.h:111
llvm::SystemZTTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: SystemZTargetTransformInfo.cpp:1104
llvm::TargetTransformInfo::RGK_ScalableVector
@ RGK_ScalableVector
Definition: TargetTransformInfo.h:919
llvm::MemCpyInst
This class wraps the llvm.memcpy intrinsic.
Definition: IntrinsicInst.h:1024
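For illustration, a sketch of the common dyn_cast pattern for recognizing a non-volatile memcpy; the helper name getMemCpySource is hypothetical:

// Sketch only: returns the source operand if I is a non-volatile
// llvm.memcpy, and nullptr otherwise.
#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"

static const llvm::Value *getMemCpySource(const llvm::Instruction *I) {
  if (const auto *MC = llvm::dyn_cast<llvm::MemCpyInst>(I))
    if (!MC->isVolatile())
      return MC->getSource();
  return nullptr;
}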
llvm::User::getNumOperands
unsigned getNumOperands() const
Definition: User.h:191
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:147
llvm::TargetTransformInfo::RegisterKind
RegisterKind
Definition: TargetTransformInfo.h:919
llvm::BasicTTIImplBase< SystemZTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1380
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:340
llvm::BasicTTIImplBase< SystemZTTIImpl >::getNumberOfParts
unsigned getNumberOfParts(Type *Tp)
Definition: BasicTTIImpl.h:2100
TargetTransformInfo.h
llvm::SystemZTTIImpl::getPopcntSupport
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
Definition: SystemZTargetTransformInfo.cpp:274
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to callin...
Definition: InstrTypes.h:1174
llvm::SystemZTTIImpl::getIntImmCostInst
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: SystemZTargetTransformInfo.cpp:102
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition: TargetTransformInfo.h:262
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
BasicTTIImpl.h
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1455
llvm::SystemZTTIImpl::hasDivRemOp
bool hasDivRemOp(Type *DataType, bool IsSigned)
Definition: SystemZTargetTransformInfo.cpp:396
llvm::SystemZTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: SystemZTargetTransformInfo.cpp:281
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:211
llvm::Type::isFP128Ty
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:157
Debug.h
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:421
llvm::TargetTransformInfo::SK_ExtractSubvector
@ SK_ExtractSubvector
ExtractSubvector: Index indicates the start offset.
Definition: TargetTransformInfo.h:879
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:164
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37
llvm::BasicTTIImplBase< SystemZTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=None)
Definition: BasicTTIImpl.h:892