//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a TargetTransformInfo analysis pass specific to the
// SystemZ target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "SystemZTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "systemztti"

//===----------------------------------------------------------------------===//
//
// SystemZ cost model.
//
//===----------------------------------------------------------------------===//

int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bits implemented yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    // Constants loaded via lgfi.
    if (isInt<32>(Imm.getSExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llilf.
    if (isUInt<32>(Imm.getZExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llihf:
    if ((Imm.getZExtValue() & 0xffffffff) == 0)
      return TTI::TCC_Basic;

    return 2 * TTI::TCC_Basic;
  }

  return 4 * TTI::TCC_Basic;
}
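
// Illustrative worked example (added for exposition, not part of the
// original source): materializing i64 1000000 fits isInt<32> and costs one
// instruction (lgfi), i.e. TTI::TCC_Basic; 0x100000000 has all low 32 bits
// clear and is loaded with llihf; a constant like 0x123456789 fits none of
// the single-instruction patterns and is estimated at 2 * TTI::TCC_Basic
// (e.g. an llihf/oilf pair).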

int SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                      const APInt &Imm, Type *Ty,
                                      TTI::TargetCostKind CostKind,
                                      Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bits implemented yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    if (Idx == 0 && Imm.getBitWidth() <= 64) {
      // Any 8-bit immediate store can be implemented via mvi.
      if (BitSize == 8)
        return TTI::TCC_Free;
      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
      if (isInt<16>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::ICmp:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Comparisons against signed 32-bit immediates implemented via cgfi.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
      // Comparisons against unsigned 32-bit immediates implemented via clgfi.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Add:
  case Instruction::Sub:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Or their negation, by swapping addition vs. subtraction.
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Mul:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use msgfi to multiply by 32-bit signed immediates.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Or:
  case Instruction::Xor:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Masks supported by oilf/xilf.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Masks supported by oihf/xihf.
      if ((Imm.getZExtValue() & 0xffffffff) == 0)
        return TTI::TCC_Free;
    }
    break;
  case Instruction::And:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Any 32-bit AND operation can be implemented via nilf.
      if (BitSize <= 32)
        return TTI::TCC_Free;
      // 64-bit masks supported by nilf.
      if (isUInt<32>(~Imm.getZExtValue()))
        return TTI::TCC_Free;
      // 64-bit masks supported by nilh.
      if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
        return TTI::TCC_Free;
      // Some 64-bit AND operations can be implemented via risbg.
      const SystemZInstrInfo *TII = ST->getInstrInfo();
      unsigned Start, End;
      if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    // Always return TCC_Free for the shift value of a shift instruction.
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
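
// Illustrative example (added for exposition, not part of the original
// source): in
//   %c = icmp slt i64 %x, 1000
// the immediate is operand Idx 1 and fits isInt<32>, so it folds into a
// cgfi and is reported as TCC_Free; a wide constant such as 0x123456789
// falls through to getIntImmCost() and keeps its materialization cost.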

int SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                        const APInt &Imm, Type *Ty,
                                        TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bits implemented yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    // These get expanded to include a normal addition/subtraction.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // These get expanded to include a normal multiplication.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
  if (ST->hasPopulationCount() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP) {
  // Find out if L contains a call, what the machine instruction count
  // estimate is, and how many stores there are.
  bool HasCall = false;
  InstructionCost NumStores = 0;
  for (auto &BB : L->blocks())
    for (auto &I : *BB) {
      if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (isLoweredToCall(F))
            HasCall = true;
          if (F->getIntrinsicID() == Intrinsic::memcpy ||
              F->getIntrinsicID() == Intrinsic::memset)
            NumStores++;
        } else { // indirect call.
          HasCall = true;
        }
      }
      if (isa<StoreInst>(&I)) {
        Type *MemAccessTy = I.getOperand(0)->getType();
        NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, None, 0,
                                     TTI::TCK_RecipThroughput);
      }
    }

  // The z13 processor will run out of store tags if too many stores
  // are fed into it too quickly. Therefore make sure there are not
  // too many stores in the resulting unrolled loop.
  unsigned const NumStoresVal = *NumStores.getValue();
  unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);

  if (HasCall) {
    // Only allow full unrolling if the loop has any calls.
    UP.FullUnrollMaxCount = Max;
    UP.MaxCount = 1;
    return;
  }

  UP.MaxCount = Max;
  if (UP.MaxCount <= 1)
    return;

  // Allow partial and runtime trip count unrolling.
  UP.Partial = UP.Runtime = true;

  UP.PartialThreshold = 75;
  UP.DefaultUnrollRuntimeCount = 4;

  // Allow expensive instructions in the pre-header of the loop.
  UP.AllowExpensiveTripCount = true;

  UP.Force = true;
}
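
// Illustrative worked example (added for exposition, not part of the
// original source): a call-free loop whose body contains three scalar
// stores (cost 1 each) gets Max = 12 / 3 = 4, so partial/runtime unrolling
// is capped at four copies of the body to avoid exhausting the z13 store
// tags.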

void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                                   TargetTransformInfo::LSRCost &C2) {
  // SystemZ specific: check instruction count (first), and don't care about
  // ImmCost, since offsets are checked explicitly.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.SetupCost);
}

unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (!Vector)
    // Discount the stack pointer. Also leave out %r0, since it can't
    // be used in an address.
    return 14;
  if (ST->hasVector())
    return 32;
  return 0;
}

TypeSize
SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(64);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
                                              unsigned NumStridedMemAccesses,
                                              unsigned NumPrefetches,
                                              bool HasCall) const {
  // Don't prefetch a loop with many far apart accesses.
  if (NumPrefetches > 16)
    return UINT_MAX;

  // Emit prefetch instructions for smaller strides in cases where we think
  // the hardware prefetcher might not be able to keep up.
  if (NumStridedMemAccesses > 32 && !HasCall &&
      (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
    return 1;

  return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
}

bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  EVT VT = TLI->getValueType(DL, DataType);
  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}

// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
  unsigned Size =
      (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
  assert(Size > 0 && "Element must have non-zero size.");
  return Size;
}

// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
// 3.
static unsigned getNumVectorRegs(Type *Ty) {
  auto *VTy = cast<FixedVectorType>(Ty);
  unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
  assert(WideBits > 0 && "Could not compute size of vector");
  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}
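
// Illustrative worked example (added for exposition, not part of the
// original source): <6 x i64> is 6 * 64 = 384 wide bits, and 384 / 128 = 3
// with no remainder, so it occupies 3 vector registers here, even though
// type legalization (and thus getNumberOfParts()) would split it into 4
// parts.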

InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // TODO: return a good value for BB-VECTORIZER that includes the
  // immediate loads, which we do not want to count for the loop
  // vectorizer, since they are hopefully hoisted out of the loop. This
  // would require a new parameter 'InLoop', but not sure if constant
  // args are common enough to motivate this.

  unsigned ScalarBits = Ty->getScalarSizeInBits();

  // There are three cases of division and remainder: Dividing with a register
  // needs a divide instruction. A divisor which is a power of two constant
  // can be implemented with a sequence of shifts. Any other constant needs a
  // multiply and shifts.
  const unsigned DivInstrCost = 20;
  const unsigned DivMulSeqCost = 10;
  const unsigned SDivPow2Cost = 4;

  bool SignedDivRem =
      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
  bool UnsignedDivRem =
      Opcode == Instruction::UDiv || Opcode == Instruction::URem;

  // Check for a constant divisor.
  bool DivRemConst = false;
  bool DivRemConstPow2 = false;
  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
    if (const Constant *C = dyn_cast<Constant>(Args[1])) {
      const ConstantInt *CVal =
          (C->getType()->isVectorTy()
               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
               : dyn_cast<const ConstantInt>(C));
      if (CVal != nullptr &&
          (CVal->getValue().isPowerOf2() || (-CVal->getValue()).isPowerOf2()))
        DivRemConstPow2 = true;
      else
        DivRemConst = true;
    }
  }
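
  // Illustrative example (added for exposition, not part of the original
  // source): for
  //   %q = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
  // the splat divisor is a power of two, so DivRemConstPow2 is set; a splat
  // of 7 would set DivRemConst instead, which is costed below as a
  // multiply-and-shift sequence (DivMulSeqCost).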

  if (!Ty->isVectorTy()) {
    // These FP operations are supported with a dedicated instruction for
    // float, double and fp128 (base implementation assumes float generally
    // costs 2).
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
      return 1;

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem)
      return LIBCALL_COST;

    // Give discount for some combined logical operations if supported.
    if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
      if (Opcode == Instruction::Xor) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() &&
                (I->getOpcode() == Instruction::And ||
                 I->getOpcode() == Instruction::Or ||
                 I->getOpcode() == Instruction::Xor))
              return 0;
        }
      }
      else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
              return 0;
        }
      }
    }

    // Or requires one instruction, although it has custom handling for i64.
    if (Opcode == Instruction::Or)
      return 1;

    if (Opcode == Instruction::Xor && ScalarBits == 1) {
      if (ST->hasLoadStoreOnCond2())
        return 5; // 2 * (li 0; loc 1); xor
      return 7; // 2 * ipm sequences ; xor ; shift ; compare
    }

    if (DivRemConstPow2)
      return (SignedDivRem ? SDivPow2Cost : 1);
    if (DivRemConst)
      return DivMulSeqCost;
    if (SignedDivRem || UnsignedDivRem)
      return DivInstrCost;
  }
  else if (ST->hasVector()) {
    auto *VTy = cast<FixedVectorType>(Ty);
    unsigned VF = VTy->getNumElements();
    unsigned NumVectors = getNumVectorRegs(Ty);

    // These vector operations are custom handled, but are still supported
    // with one instruction per vector, regardless of element size.
    if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
        Opcode == Instruction::AShr) {
      return NumVectors;
    }

    if (DivRemConstPow2)
      return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
    if (DivRemConst) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args, Tys);
    }
    if ((SignedDivRem || UnsignedDivRem) && VF > 4)
      // Temporary hack: disable high vectorization factors with integer
      // division/remainder, which will get scalarized and handled with
      // GR128 registers. The mischeduler is not clever enough to avoid
      // spilling yet.
      return 1000;

    // These FP operations are supported with a single vector instruction for
    // double (base implementation assumes float generally costs 2). For
    // FP128, the scalar cost is 1, and there is no overhead since the values
    // are already in scalar registers.
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
      switch (ScalarBits) {
      case 32: {
        // The vector enhancements facility 1 provides v4f32 instructions.
        if (ST->hasVectorEnhancements1())
          return NumVectors;
        // Return the cost of multiple scalar invocations plus the cost of
        // inserting and extracting the values.
        InstructionCost ScalarCost =
            getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
        SmallVector<Type *> Tys(Args.size(), Ty);
        InstructionCost Cost =
            (VF * ScalarCost) + getScalarizationOverhead(VTy, Args, Tys);
        // FIXME: VF 2 for these FP operations are currently just as
        // expensive as for VF 4.
        if (VF == 2)
          Cost *= 2;
        return Cost;
      }
      case 64:
      case 128:
        return NumVectors;
      default:
        break;
      }
    }

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      unsigned Cost =
          (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args, Tys);
      // FIXME: VF 2 for float is currently just as expensive as for VF 4.
      if (VF == 2 && ScalarBits == 32)
        Cost *= 2;
      return Cost;
    }
  }

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}

InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                               VectorType *Tp,
                                               ArrayRef<int> Mask, int Index,
                                               VectorType *SubTp) {
  if (ST->hasVector()) {
    unsigned NumVectors = getNumVectorRegs(Tp);

    // TODO: Since fp32 is expanded, the shuffle cost should always be 0.

    // FP128 values are always in scalar registers, so there is no work
    // involved with a shuffle, except for broadcast. In that case register
    // moves are done with a single instruction per element.
    if (Tp->getScalarType()->isFP128Ty())
      return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);

    switch (Kind) {
    case TargetTransformInfo::SK_ExtractSubvector:
      // ExtractSubvector Index indicates start offset.

      // Extracting a subvector from first index is a noop.
      return (Index == 0 ? 0 : NumVectors);

    case TargetTransformInfo::SK_Broadcast:
      // Loop vectorizer calls here to figure out the extra cost of
      // broadcasting a loaded value to all elements of a vector. Since vlrep
      // loads and replicates with a single instruction, adjust the returned
      // value.
      return NumVectors - 1;

    default:

      // SystemZ supports single instruction permutation / replication.
      return NumVectors;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}
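
// Illustrative example (added for exposition, not part of the original
// source): broadcasting a loaded float into <4 x float> is done with a
// single vlrep, so SK_Broadcast on one vector register reports an extra
// cost of NumVectors - 1 = 0 on top of the load itself.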

// Return the log2 difference of the element sizes of the two vector types.
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
  unsigned Bits0 = Ty0->getScalarSizeInBits();
  unsigned Bits1 = Ty1->getScalarSizeInBits();

  if (Bits1 > Bits0)
    return (Log2_32(Bits1) - Log2_32(Bits0));

  return (Log2_32(Bits0) - Log2_32(Bits1));
}
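
// Illustrative worked example (added for exposition, not part of the
// original source): between i64 and i8 elements the difference is
// Log2_32(64) - Log2_32(8) = 6 - 3 = 3, i.e. three halvings (or doublings)
// of the element size.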

// Return the number of instructions needed to truncate SrcTy to DstTy.
unsigned SystemZTTIImpl::
getVectorTruncCost(Type *SrcTy, Type *DstTy) {
  assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
  assert(SrcTy->getPrimitiveSizeInBits().getFixedSize() >
             DstTy->getPrimitiveSizeInBits().getFixedSize() &&
         "Packing must reduce size of vector type.");
  assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
             cast<FixedVectorType>(DstTy)->getNumElements() &&
         "Packing should not change number of elements.");

  // TODO: Since fp32 is expanded, the extract cost should always be 0.

  unsigned NumParts = getNumVectorRegs(SrcTy);
  if (NumParts <= 2)
    // Up to 2 vector registers can be truncated efficiently with pack or
    // permute. The latter requires an immediate mask to be loaded, which
    // typically gets hoisted out of a loop. TODO: return a good value for
    // BB-VECTORIZER that includes the immediate loads, which we do not want
    // to count for the loop vectorizer.
    return 1;

  unsigned Cost = 0;
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
  for (unsigned P = 0; P < Log2Diff; ++P) {
    if (NumParts > 1)
      NumParts /= 2;
    Cost += NumParts;
  }

  // Currently, a general mix of permutes and pack instructions is output by
  // isel, which follows the cost computation above except for this case which
  // is one instruction less:
  if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
      DstTy->getScalarSizeInBits() == 8)
    Cost--;

  return Cost;
}
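
// Illustrative worked example (added for exposition, not part of the
// original source): truncating <8 x i64> to <8 x i8> starts from
// NumParts = 4 with Log2Diff = 3, so the loop adds 2 + 1 + 1 = 4 packing
// steps; the special case above then subtracts one, for a final cost of 3.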

// Return the cost of converting a vector bitmask produced by a compare
// (SrcTy), to the type of the select or extend instruction (DstTy).
unsigned SystemZTTIImpl::
getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
  assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
          "Should only be called with vector types.");

  unsigned PackCost = 0;
  unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
  unsigned DstScalarBits = DstTy->getScalarSizeInBits();
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  if (SrcScalarBits > DstScalarBits)
    // The bitmask will be truncated.
    PackCost = getVectorTruncCost(SrcTy, DstTy);
  else if (SrcScalarBits < DstScalarBits) {
    unsigned DstNumParts = getNumVectorRegs(DstTy);
    // Each vector select needs its part of the bitmask unpacked.
    PackCost = Log2Diff * DstNumParts;
    // Extra cost for moving part of mask before unpacking.
    PackCost += DstNumParts - 1;
  }

  return PackCost;
}
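
// Illustrative worked example (added for exposition, not part of the
// original source): widening a <4 x i32> compare mask for a <4 x i64>
// select has Log2Diff = 1 and DstNumParts = 2, so
// PackCost = 1 * 2 + (2 - 1) = 3.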

// Return the type of the compared operands. This is needed to compute the
// cost for a Select / ZExt or SExt instruction.
static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
  Type *OpTy = nullptr;
  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
    OpTy = CI->getOperand(0)->getType();
  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
    if (LogicI->getNumOperands() == 2)
      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
        if (isa<CmpInst>(LogicI->getOperand(1)))
          OpTy = CI0->getOperand(0)->getType();

  if (OpTy != nullptr) {
    if (VF == 1) {
      assert (!OpTy->isVectorTy() && "Expected scalar type");
      return OpTy;
    }
    // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
    // be either scalar or already vectorized with the same or a smaller VF.
    Type *ElTy = OpTy->getScalarType();
    return FixedVectorType::get(ElTy, VF);
  }

  return nullptr;
}

// Get the cost of converting a boolean vector to a vector with same width
// and element size as Dst, plus the cost of zero extending if needed.
unsigned SystemZTTIImpl::
getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
                              const Instruction *I) {
  auto *DstVTy = cast<FixedVectorType>(Dst);
  unsigned VF = DstVTy->getNumElements();
  unsigned Cost = 0;
  // If we know the widths of the compared operands, get any cost of
  // converting the result to match Dst. Otherwise assume the widths match.
  Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
  if (CmpOpTy != nullptr)
    Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
  if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
    // One 'vn' per dst vector with an immediate mask.
    Cost += getNumVectorRegs(Dst);
  return Cost;
}

InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                 Type *Src,
                                                 TTI::CastContextHint CCH,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  // FIXME: Can the logic below also be used for these cost kinds?
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
    auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    return BaseCost == 0 ? BaseCost : 1;
  }

  unsigned DstScalarBits = Dst->getScalarSizeInBits();
  unsigned SrcScalarBits = Src->getScalarSizeInBits();

  if (!Src->isVectorTy()) {
    assert (!Dst->isVectorTy());

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
      if (SrcScalarBits >= 32 ||
          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
        return 1;
      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
    }

    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
        Src->isIntegerTy(1)) {
      if (ST->hasLoadStoreOnCond2())
        return 2; // li 0; loc 1

      // This should be extension of a compare i1 result, which is done with
      // ipm and a varying sequence of instructions.
      unsigned Cost = 0;
      if (Opcode == Instruction::SExt)
        Cost = (DstScalarBits < 64 ? 3 : 4);
      if (Opcode == Instruction::ZExt)
        Cost = 3;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
        // If operands of an fp-type were compared, this costs +1.
        Cost++;
      return Cost;
    }
  }
  else if (ST->hasVector()) {
    // Vector to scalar cast.
    auto *SrcVecTy = cast<FixedVectorType>(Src);
    auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
    if (!DstVecTy) {
      // TODO: tune vector-to-scalar cast.
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    }
    unsigned VF = SrcVecTy->getNumElements();
    unsigned NumDstVectors = getNumVectorRegs(Dst);
    unsigned NumSrcVectors = getNumVectorRegs(Src);

    if (Opcode == Instruction::Trunc) {
      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
        return 0; // Check for NOOP conversions.
      return getVectorTruncCost(Src, Dst);
    }

    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
      if (SrcScalarBits >= 8) {
        // ZExt/SExt will be handled with one unpack per doubling of width.
        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);

        // For types that span multiple vector registers, some additional
        // instructions are used to set up the unpacking.
        unsigned NumSrcVectorOps =
            (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
                            : (NumDstVectors / 2));

        return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
      }
      else if (SrcScalarBits == 1)
        return getBoolVecToIntConversionCost(Opcode, Dst, I);
    }

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
        Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
      // TODO: Fix base implementation which could simplify things a bit here
      // (seems to miss on differentiating on scalar/vector types).

      // Only 64 bit vector conversions are natively supported before z15.
      if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
        if (SrcScalarBits == DstScalarBits)
          return NumDstVectors;

        if (SrcScalarBits == 1)
          return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
      }

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values. Base implementation does not
      // realize float->int gets scalarized.
      InstructionCost ScalarCost = getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
      InstructionCost TotCost = VF * ScalarCost;
      bool NeedsInserts = true, NeedsExtracts = true;
      // FP128 registers do not get inserted or extracted.
      if (DstScalarBits == 128 &&
          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
        NeedsInserts = false;
      if (SrcScalarBits == 128 &&
          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
        NeedsExtracts = false;

      TotCost += getScalarizationOverhead(SrcVecTy, false, NeedsExtracts);
      TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts, false);

      // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
      if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
        TotCost *= 2;

      return TotCost;
    }

    if (Opcode == Instruction::FPTrunc) {
      if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
        return VF /*ldxbr/lexbr*/ +
               getScalarizationOverhead(DstVecTy, true, false);
      else // double -> float
        return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
    }

    if (Opcode == Instruction::FPExt) {
      if (SrcScalarBits == 32 && DstScalarBits == 64) {
        // float -> double is very rare and currently unoptimized. Instead of
        // using vldeb, which can do two at a time, all conversions are
        // scalarized.
        return VF * 2;
      }
      // -> fp128. VF * lxdb/lxeb + extraction of elements.
      return VF + getScalarizationOverhead(SrcVecTy, false, true);
    }
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

// Scalar i8 / i16 operations will typically be made after first extending
// the operands to i32.
static unsigned getOperandsExtensionCost(const Instruction *I) {
  unsigned ExtCost = 0;
  for (Value *Op : I->operands())
    // A load of i8 or i16 sign/zero extends to i32.
    if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
      ExtCost++;

  return ExtCost;
}

InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                   Type *CondTy,
                                                   CmpInst::Predicate VecPred,
                                                   TTI::TargetCostKind CostKind,
                                                   const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);

  if (!ValTy->isVectorTy()) {
    switch (Opcode) {
    case Instruction::ICmp: {
      // A loaded value compared with 0 with multiple users becomes Load and
      // Test. The load is then not foldable, so return 0 cost for the ICmp.
      unsigned ScalarBits = ValTy->getScalarSizeInBits();
      if (I != nullptr && ScalarBits >= 32)
        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
                C->isZero())
              return 0;

      unsigned Cost = 1;
      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
      return Cost;
    }
    case Instruction::Select:
      if (ValTy->isFloatingPointTy())
        return 4; // No load on condition for FP - costs a conditional jump.
      return 1; // Load On Condition / Select Register.
    }
  }
  else if (ST->hasVector()) {
    unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();

    // Called with a compare instruction.
    if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
      unsigned PredicateExtraCost = 0;
      if (I != nullptr) {
        // Some predicates cost one or two extra instructions.
        switch (cast<CmpInst>(I)->getPredicate()) {
        case CmpInst::Predicate::ICMP_NE:
        case CmpInst::Predicate::ICMP_UGE:
        case CmpInst::Predicate::ICMP_ULE:
        case CmpInst::Predicate::ICMP_SGE:
        case CmpInst::Predicate::ICMP_SLE:
          PredicateExtraCost = 1;
          break;
        case CmpInst::Predicate::FCMP_ONE:
        case CmpInst::Predicate::FCMP_ORD:
        case CmpInst::Predicate::FCMP_UEQ:
        case CmpInst::Predicate::FCMP_UNO:
          PredicateExtraCost = 2;
          break;
        default:
          break;
        }
      }

      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
      // floats. FIXME: <2 x float> generates the same code as <4 x float>.
      unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
      unsigned NumVecs_cmp = getNumVectorRegs(ValTy);

      unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
      return Cost;
    }
    else { // Called with a select instruction.
      assert (Opcode == Instruction::Select);

      // We can figure out the extra cost of packing / unpacking if the
      // instruction was passed and the compare instruction is found.
      unsigned PackCost = 0;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
      if (CmpOpTy != nullptr)
        PackCost =
            getVectorBitmaskConversionCost(CmpOpTy, ValTy);

      return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
    }
  }

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
}

InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   unsigned Index) {
  // vlvgp will insert two grs into a vector register, so only count half the
  // number of instructions.
  if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
    return ((Index % 2 == 0) ? 1 : 0);

  if (Opcode == Instruction::ExtractElement) {
    int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);

    // Give a slight penalty for moving out of vector pipeline to FXU unit.
    if (Index == 0 && Val->isIntOrIntVectorTy())
      Cost += 1;

    return Cost;
  }

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}
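
// Illustrative example (added for exposition, not part of the original
// source): inserting i64 elements into <2 x i64> costs 1 at even indices
// and 0 at odd ones, since a single vlvgp moves a pair of GPRs at a time;
// extracting element 0 of an integer vector costs 2 (the extract plus the
// vector-to-FXU move penalty).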

// Check if a load may be folded as a memory operand in its user.
bool SystemZTTIImpl::
isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
  if (!Ld->hasOneUse())
    return false;
  FoldedValue = Ld;
  const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
  unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
  unsigned TruncBits = 0;
  unsigned SExtBits = 0;
  unsigned ZExtBits = 0;
  if (UserI->hasOneUse()) {
    unsigned UserBits = UserI->getType()->getScalarSizeInBits();
    if (isa<TruncInst>(UserI))
      TruncBits = UserBits;
    else if (isa<SExtInst>(UserI))
      SExtBits = UserBits;
    else if (isa<ZExtInst>(UserI))
      ZExtBits = UserBits;
  }
  if (TruncBits || SExtBits || ZExtBits) {
    FoldedValue = UserI;
    UserI = cast<Instruction>(*UserI->user_begin());
    // Load (single use) -> trunc/extend (single use) -> UserI
  }
  if ((UserI->getOpcode() == Instruction::Sub ||
       UserI->getOpcode() == Instruction::SDiv ||
       UserI->getOpcode() == Instruction::UDiv) &&
      UserI->getOperand(1) != FoldedValue)
    return false; // Not commutative, only RHS foldable.
  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
  // extension was made of the load.
  unsigned LoadOrTruncBits =
      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
  switch (UserI->getOpcode()) {
  case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
  case Instruction::Sub:
  case Instruction::ICmp:
    if (LoadedBits == 32 && ZExtBits == 64)
      return true;
    LLVM_FALLTHROUGH;
  case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
    if (UserI->getOpcode() != Instruction::ICmp) {
      if (LoadedBits == 16 &&
          (SExtBits == 32 ||
           (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
        return true;
      if (LoadOrTruncBits == 16)
        return true;
    }
    LLVM_FALLTHROUGH;
  case Instruction::SDiv: // SE: 32->64
    if (LoadedBits == 32 && SExtBits == 64)
      return true;
    LLVM_FALLTHROUGH;
  case Instruction::UDiv:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    // This also makes sense for float operations, but disabled for now due
    // to regressions.
    // case Instruction::FCmp:
    // case Instruction::FAdd:
    // case Instruction::FSub:
    // case Instruction::FMul:
    // case Instruction::FDiv:

    // All possible extensions of memory checked above.

    // Comparison between memory and immediate.
    if (UserI->getOpcode() == Instruction::ICmp)
      if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
        if (CI->getValue().isIntN(16))
          return true;
    return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
    break;
  }
  return false;
}
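
// Illustrative example (added for exposition, not part of the original
// source): in
//   %v = load i32, i32* %p
//   %e = sext i32 %v to i64
//   %r = sdiv i64 %x, %e
// the single-use load feeds a single-use sext on the divisor (RHS) operand,
// which matches the SDiv case above (SE: 32->64), so the load is considered
// foldable into the division's memory operand.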

static bool isBswapIntrinsicCall(const Value *V) {
  if (const Instruction *I = dyn_cast<Instruction>(V))
    if (auto *CI = dyn_cast<CallInst>(I))
      if (auto *F = CI->getCalledFunction())
        if (F->getIntrinsicID() == Intrinsic::bswap)
          return true;
  return false;
}

InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                                MaybeAlign Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                const Instruction *I) {
  assert(!Src->isVoidTy() && "Invalid type");

  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
    // Store the load or its truncated or extended value in FoldedValue.
    const Instruction *FoldedValue = nullptr;
    if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
      const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
      assert (UserI->getNumOperands() == 2 && "Expected a binop.");

      // UserI can't fold two loads, so in that case return 0 cost only
      // half of the time.
      for (unsigned i = 0; i < 2; ++i) {
        if (UserI->getOperand(i) == FoldedValue)
          continue;

        if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))) {
          LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
          if (!OtherLoad &&
              (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
               isa<ZExtInst>(OtherOp)))
            OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
          if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
            return i == 0; // Both operands foldable.
        }
      }

      return 0; // Only I is foldable in user.
    }
  }

  unsigned NumOps =
      (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));

  // Store/Load reversed saves one instruction.
  if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
      I != nullptr) {
    if (Opcode == Instruction::Load && I->hasOneUse()) {
      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
      // In case of load -> bswap -> store, return normal cost for the load.
      if (isBswapIntrinsicCall(LdUser) &&
          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
        return 0;
    }
    else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
      const Value *StoredVal = SI->getValueOperand();
      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
        return 0;
    }
  }

  if (Src->getScalarSizeInBits() == 128)
    // 128 bit scalars are held in a pair of two 64 bit registers.
    NumOps *= 2;

  return NumOps;
}
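
// Illustrative example (added for exposition, not part of the original
// source): a load of i64 whose only user is llvm.bswap.i64 is costed 0,
// since the byte swap folds into a load-reversed instruction (e.g. lrvg);
// in a load -> bswap -> store chain the load keeps its normal cost and the
// store side gets the discount instead.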

// The generic implementation of getInterleavedMemoryOpCost() is based on
// adding costs of the memory operations plus all the extracts and inserts
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Return the ceiling of dividing A by B.
  auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };

  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
  unsigned VF = NumElts / Factor;
  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
  unsigned NumPermutes = 0;

  if (Opcode == Instruction::Load) {
    // Loading interleave groups may have gaps, which may mean fewer
    // loads. Find out how many vectors will be loaded in total, and how
    // many of them each value will be in.
    BitVector UsedInsts(NumVectorMemOps, false);
    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
    for (unsigned Index : Indices)
      for (unsigned Elt = 0; Elt < VF; ++Elt) {
        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
        UsedInsts.set(Vec);
        ValueVecs[Index].set(Vec);
      }
    NumVectorMemOps = UsedInsts.count();

    for (unsigned Index : Indices) {
      // Estimate that each loaded source vector containing this Index
      // requires one operation, except that vperm can handle two input
      // registers first time for each dst vector.
      unsigned NumSrcVecs = ValueVecs[Index].count();
      unsigned NumDstVecs = ceil(VF * getScalarSizeInBits(VecTy), 128U);
      assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
    }
  } else {
    // Estimate the permutes for each stored vector as the smaller of the
    // number of elements and the number of source vectors. Subtract one per
    // dst vector for vperm (S.A.).
    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
    unsigned NumDstVecs = NumVectorMemOps;
    assert (NumSrcVecs > 1 && "Expected at least two source vectors.");
    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
  }

  // Cost of load/store operations and the permutations needed.
  return NumVectorMemOps + NumPermutes;
}
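
// Illustrative worked example (added for exposition, not part of the
// original source): an interleaved load of <8 x i32> with Factor = 2 and
// both indices used reads 2 vector registers; each of the two strided
// values spans both registers and fits in one destination vector, so one
// vperm per value suffices, for a total cost of 2 + 2 = 4.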

static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
    return getNumVectorRegs(RetTy); // VPERM
  return -1;
}

InstructionCost
SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  InstructionCost Cost =
      getVectorIntrinsicInstrCost(ICA.getID(), ICA.getReturnType());
  if (Cost != -1)
    return Cost;
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
llvm::SystemZTTIImpl::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: SystemZTargetTransformInfo.cpp:1135
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
i
i
Definition: README.txt:29
llvm::InstructionCost
Definition: InstructionCost.h:26
llvm::EngineKind::Kind
Kind
Definition: ExecutionEngine.h:524
llvm::TargetTransformInfo::PSK_FastHardware
@ PSK_FastHardware
Definition: TargetTransformInfo.h:586
llvm::BasicTTIImplBase< SystemZTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:38
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:480
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:211
llvm::TargetTransformInfo::LSRCost::NumRegs
unsigned NumRegs
Definition: TargetTransformInfo.h:413
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:448
llvm::SystemZSubtarget::hasLoadStoreOnCond2
bool hasLoadStoreOnCond2() const
Definition: SystemZSubtarget.h:124
llvm
Definition: AllocatorList.h:23
llvm::SystemZTTIImpl::getBoolVecToIntConversionCost
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, const Instruction *I)
Definition: SystemZTargetTransformInfo.cpp:702
llvm::EVT::isScalarInteger
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:144
llvm::InstructionCost::getValue
Optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
Definition: InstructionCost.h:68
getVectorIntrinsicInstrCost
static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy)
Definition: SystemZTargetTransformInfo.cpp:1193
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:447
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:722
llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition: TargetTransformInfo.h:464
IntrinsicInst.h
ceil
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g ceil
Definition: README-FPStack.txt:54
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:426
llvm::Function
Definition: Function.h:61
llvm::SystemZSubtarget::hasVector
bool hasVector() const
Definition: SystemZSubtarget.h:190
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
llvm::BitVector::set
BitVector & set()
Definition: BitVector.h:343
llvm::TargetTransformInfo::PopcntSupportKind
PopcntSupportKind
Flags indicating the kind of support for population count.
Definition: TargetTransformInfo.h:586
llvm::APInt::isPowerOf2
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:469
C1
instcombine should handle this C2 when C1
Definition: README.txt:263
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:317
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:131
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1643
llvm::BasicTTIImplBase< SystemZTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:683
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:148
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:443
llvm::TargetTransformInfo::RGK_Scalar
@ RGK_Scalar
Definition: TargetTransformInfo.h:924
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:214
llvm::TargetTransformInfo::LSRCost::NumIVMuls
unsigned NumIVMuls
Definition: TargetTransformInfo.h:415
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:476
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:46
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1581
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:529
llvm::TargetTransformInfo::UnrollingPreferences::FullUnrollMaxCount
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
Definition: TargetTransformInfo.h:468
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:410
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:116
llvm::SystemZInstrInfo
Definition: SystemZInstrInfo.h:174
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
llvm::Type::isFloatingPointTy
bool isFloatingPointTy() const
Return true if this is one of the six floating-point types.
Definition: Type.h:163
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::TargetTransformInfo::UnrollingPreferences::AllowExpensiveTripCount
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...
Definition: TargetTransformInfo.h:485
llvm::SystemZTTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(unsigned ClassID) const
Definition: SystemZTargetTransformInfo.cpp:316
llvm::TargetTransformInfo::LSRCost::Insns
unsigned Insns
TODO: Some of these could be merged.
Definition: TargetTransformInfo.h:412
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:846
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::SystemZTTIImpl::getIntImmCost
int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: SystemZTargetTransformInfo.cpp:33
llvm::TargetTransformInfo::LSRCost::AddRecCost
unsigned AddRecCost
Definition: TargetTransformInfo.h:414
getCmpOpsType
static Type * getCmpOpsType(const Instruction *I, unsigned VF=1)
Definition: SystemZTargetTransformInfo.cpp:675
llvm::TargetTransformInfo::LSRCost::SetupCost
unsigned SetupCost
Definition: TargetTransformInfo.h:418
TargetLowering.h
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:77
llvm::SystemZTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: SystemZTargetTransformInfo.cpp:955
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:160
llvm::SystemZSubtarget::hasMiscellaneousExtensions3
bool hasMiscellaneousExtensions3() const
Definition: SystemZSubtarget.h:220
getScalarSizeInBits
static unsigned getScalarSizeInBits(Type *Ty)
Definition: SystemZTargetTransformInfo.cpp:365
llvm::SystemZTTIImpl::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: SystemZTargetTransformInfo.cpp:718
llvm::BasicTTIImplBase< SystemZTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:975
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:845
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1079
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:35
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::TargetTransformInfo::UnrollingPreferences::Force
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
Definition: TargetTransformInfo.h:488
llvm::BitVector::count
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:154
SI
@ SI
Definition: SIInstrInfo.cpp:7344
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:235
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:129
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:119
llvm::Log2_32
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:596
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:147
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1631
llvm::SystemZTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: SystemZTargetTransformInfo.cpp:1200
llvm::BitVector
Definition: BitVector.h:74
llvm::SystemZTTIImpl::getIntImmCostInst
int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: SystemZTargetTransformInfo.cpp:66
llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition: TargetTransformInfo.h:924
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:644
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::MCID::Call
@ Call
Definition: MCInstrDesc.h:154
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
llvm::BasicTTIImplBase< SystemZTTIImpl >::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:928
llvm::SystemZTTIImpl::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: SystemZTargetTransformInfo.cpp:870
getNumVectorRegs
static unsigned getNumVectorRegs(Type *Ty)
Definition: SystemZTargetTransformInfo.cpp:375
llvm::BasicTTIImplBase< SystemZTTIImpl >::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:764
llvm::None
const NoneType None
Definition: None.h:23
llvm::LinearPolySize< TypeSize >::getFixed
static TypeSize getFixed(ScalarTy MinVal)
Definition: TypeSize.h:284
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:116
llvm::TargetTransformInfo::PSK_Software
@ PSK_Software
Definition: TargetTransformInfo.h:586
llvm::SystemZSubtarget::hasVectorEnhancements2
bool hasVectorEnhancements2() const
Definition: SystemZSubtarget.h:229
getCalledFunction
static const Function * getCalledFunction(const Value *V, bool LookThroughBitCast, bool &IsNoBuiltin)
Definition: MemoryBuiltins.cpp:117
llvm::CmpInst
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:712
llvm::SystemZTTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
Definition: SystemZTargetTransformInfo.cpp:328
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:202
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:391
llvm::SystemZTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: SystemZTargetTransformInfo.cpp:382
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:303
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::isInt< 32 >
constexpr bool isInt< 32 >(int64_t x)
Definition: MathExtras.h:373
llvm::SystemZTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: SystemZTargetTransformInfo.cpp:299
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
getOperandsExtensionCost
static unsigned getOperandsExtensionCost(const Instruction *I)
Definition: SystemZTargetTransformInfo.cpp:860
llvm::TargetTransformInfo::LSRCost
Definition: TargetTransformInfo.h:409
llvm::TargetLoweringBase::isTypeLegal
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
Definition: TargetLowering.h:881
llvm::SystemZTTIImpl::isFoldableLoad
bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue)
Definition: SystemZTargetTransformInfo.cpp:977
llvm::SystemZTTIImpl::getVectorBitmaskConversionCost
unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy)
Definition: SystemZTargetTransformInfo.cpp:651
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:423
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:903
llvm::SystemZTTIImpl::getVectorTruncCost
unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy)
Definition: SystemZTargetTransformInfo.cpp:609
llvm::isUInt< 32 >
constexpr bool isUInt< 32 >(uint64_t x)
Definition: MathExtras.h:411
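A hand-written sketch of the signed/unsigned range predicates (assumes llvm/Support/MathExtras.h and <cstdint>); checks like these are what the immediate-cost queries indexed on this page use to classify constants:
  bool S = llvm::isInt<32>(INT64_C(0x7fffffff));    // true: fits a signed i32
  bool U = llvm::isUInt<32>(UINT64_C(0x100000000)); // false: needs 33 bits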
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::SystemZTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: SystemZTargetTransformInfo.cpp:557
SystemZTargetTransformInfo.h
llvm::SystemZTTIImpl::isLSRCostLess
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2)
Definition: SystemZTargetTransformInfo.cpp:304
isBswapIntrinsicCall
static bool isBswapIntrinsicCall(const Value *V)
Definition: SystemZTargetTransformInfo.cpp:1055
llvm::TargetTransformInfoImplBase::isLoweredToCall
bool isLoweredToCall(const Function *F) const
Definition: TargetTransformInfoImpl.h:115
getElSizeLog2Diff
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1)
Definition: SystemZTargetTransformInfo.cpp:597
llvm::SystemZSubtarget::hasVectorEnhancements1
bool hasVectorEnhancements1() const
Definition: SystemZSubtarget.h:209
llvm::TargetTransformInfo::LSRCost::ScaleCost
unsigned ScaleCost
Definition: TargetTransformInfo.h:419
llvm::Type::isVoidTy
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
llvm::BasicTTIImplBase< SystemZTTIImpl >::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: BasicTTIImpl.h:1072
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:895
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:70
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition: TargetTransformInfo.h:262
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:215
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
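A hand-written sketch (the function and cases are hypothetical, chosen only to show placement after a switch the author believes is exhaustive):
  int widthClass(unsigned Bits) { // hypothetical helper
    switch (Bits) {
    case 32: return 0;
    case 64: return 1;
    }
    llvm_unreachable("unsupported bit width"); // traps in debug builds
  }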
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
Definition: TargetTransformInfo.h:459
llvm::Type::isPtrOrPtrVectorTy
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:232
llvm::BasicTTIImplBase< SystemZTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:491
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:281
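A hand-written sketch (Opcode and Cost are hypothetical locals, used only to show the macro's placement):
  switch (Opcode) {
  case 0:
    ++Cost;
    LLVM_FALLTHROUGH; // deliberate: the case-1 work also applies here
  case 1:
    ++Cost;
    break;
  }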
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:163
llvm::isInt< 16 >
constexpr bool isInt< 16 >(int64_t x)
Definition: MathExtras.h:370
llvm::PPC::getPredicate
Predicate getPredicate(unsigned Condition, unsigned Hint)
Return predicate consisting of specified condition and hint bits.
Definition: PPCPredicates.h:87
llvm::TargetTransformInfo::LSRCost::NumBaseAdds
unsigned NumBaseAdds
Definition: TargetTransformInfo.h:416
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:148
llvm::SystemZTTIImpl::getMinPrefetchStride
unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const override
Definition: SystemZTargetTransformInfo.cpp:341
llvm::SystemZSubtarget::hasPopulationCount
bool hasPopulationCount() const
Definition: SystemZSubtarget.h:133
llvm::SystemZSubtarget::hasMiscellaneousExtensions2
bool hasMiscellaneousExtensions2() const
Definition: SystemZSubtarget.h:193
CostTable.h
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:314
llvm::M68kBeads::Bits1
@ Bits1
Definition: M68kBaseInfo.h:54
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:208
llvm::TypeSize
Definition: TypeSize.h:417
llvm::LinearPolySize< TypeSize >::getScalable
static TypeSize getScalable(ScalarTy MinVal)
Definition: TypeSize.h:287
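A minimal sketch of the two factory functions above (hand-written; assumes llvm/Support/TypeSize.h):
  llvm::TypeSize FixedTS = llvm::TypeSize::getFixed(128);    // exactly 128 bits
  llvm::TypeSize ScalTS  = llvm::TypeSize::getScalable(128); // vscale x 128 bits
  bool Scalable = ScalTS.isScalable();                       // true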
llvm::SystemZSubtarget::getInstrInfo
const SystemZInstrInfo * getInstrInfo() const override
Definition: SystemZSubtarget.h:89
llvm::SystemZTTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: SystemZTargetTransformInfo.cpp:1064
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:184
llvm::SystemZTTIImpl::getIntImmCostIntrin
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: SystemZTargetTransformInfo.cpp:186
llvm::TargetTransformInfo::RGK_ScalableVector
@ RGK_ScalableVector
Definition: TargetTransformInfo.h:924
Vector
Definition: README_P9.txt:497
llvm::User::getNumOperands
unsigned getNumOperands() const
Definition: User.h:191
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:146
llvm::TargetTransformInfo::RegisterKind
RegisterKind
Definition: TargetTransformInfo.h:924
llvm::BasicTTIImplBase< SystemZTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: BasicTTIImpl.h:741
llvm::BasicTTIImplBase< SystemZTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1224
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:350
llvm::BasicTTIImplBase< SystemZTTIImpl >::getNumberOfParts
unsigned getNumberOfParts(Type *Tp)
Definition: BasicTTIImpl.h:1894
TargetTransformInfo.h
llvm::SystemZTTIImpl::getPopcntSupport
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
Definition: SystemZTargetTransformInfo.cpp:237
llvm::BasicTTIImplBase< SystemZTTIImpl >::getScalarizationOverhead
unsigned getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:603
BB
Definition: README.txt:39
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition: TargetTransformInfo.h:263
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
BasicTTIImpl.h
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1382
llvm::SystemZTTIImpl::hasDivRemOp
bool hasDivRemOp(Type *DataType, bool IsSigned)
Definition: SystemZTargetTransformInfo.cpp:358
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:212
llvm::Type::isFP128Ty
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:157
Debug.h
llvm::TargetTransformInfo::SK_ExtractSubvector
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
Definition: TargetTransformInfo.h:853
llvm::SystemZTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: SystemZTargetTransformInfo.cpp:244
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:122
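A hand-written sketch (assumes llvm/IR/Type.h) showing that the result is a TypeSize, not a plain integer:
  llvm::LLVMContext Ctx;
  llvm::Type *F64 = llvm::Type::getDoubleTy(Ctx);
  llvm::TypeSize Bits = F64->getPrimitiveSizeInBits(); // fixed, 64 bits
  uint64_t N = Bits.getFixedSize();                    // 64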
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38