//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a TargetTransformInfo analysis pass specific to the
// SystemZ target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "SystemZTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "systemztti"

//===----------------------------------------------------------------------===//
//
// SystemZ cost model.
//
//===----------------------------------------------------------------------===//

InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bit implemented yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  if (Imm == 0)
    return TTI::TCC_Free;

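  // Illustration of the classification below (one instruction each):
  //   0x000000007fffffff -> lgfi  (sign-extended 32-bit immediate)
  //   0x00000000ffffffff -> llilf (32-bit immediate in the low half)
  //   0xffffffff00000000 -> llihf (32-bit immediate in the high half)
  // An immediate such as 0x0000000100000001 fits none of these patterns and
  // is assumed to need two instructions (2 * TCC_Basic).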
  if (Imm.getBitWidth() <= 64) {
    // Constants loaded via lgfi.
    if (isInt<32>(Imm.getSExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llilf.
    if (isUInt<32>(Imm.getZExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llihf:
    if ((Imm.getZExtValue() & 0xffffffff) == 0)
      return TTI::TCC_Basic;

    return 2 * TTI::TCC_Basic;
  }

  return 4 * TTI::TCC_Basic;
}

InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bit implemented yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    if (Idx == 0 && Imm.getBitWidth() <= 64) {
      // Any 8-bit immediate store can be implemented via mvi.
      if (BitSize == 8)
        return TTI::TCC_Free;
      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
      if (isInt<16>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::ICmp:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Comparisons against signed 32-bit immediates implemented via cgfi.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
      // Comparisons against unsigned 32-bit immediates implemented via clgfi.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Add:
  case Instruction::Sub:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Or their negation, by swapping addition vs. subtraction.
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Mul:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use msgfi to multiply by 32-bit signed immediates.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Or:
  case Instruction::Xor:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Masks supported by oilf/xilf.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Masks supported by oihf/xihf.
      if ((Imm.getZExtValue() & 0xffffffff) == 0)
        return TTI::TCC_Free;
    }
    break;
  case Instruction::And:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Any 32-bit AND operation can be implemented via nilf.
      if (BitSize <= 32)
        return TTI::TCC_Free;
      // 64-bit masks supported by nilf.
      if (isUInt<32>(~Imm.getZExtValue()))
        return TTI::TCC_Free;
      // 64-bit masks supported by nilh.
      if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
        return TTI::TCC_Free;
      // Some 64-bit AND operations can be implemented via risbg.
      const SystemZInstrInfo *TII = ST->getInstrInfo();
      unsigned Start, End;
      if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    // Always return TCC_Free for the shift value of a shift instruction.
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost
SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bit implemented yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    // These get expanded to include a normal addition/subtraction.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // These get expanded to include a normal multiplication.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
  if (ST->hasPopulationCount() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP,
                                             OptimizationRemarkEmitter *ORE) {
  // Find out if L contains a call, what the machine instruction count
  // estimate is, and how many stores there are.
  bool HasCall = false;
  InstructionCost NumStores = 0;
  for (auto &BB : L->blocks())
    for (auto &I : *BB) {
      if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (isLoweredToCall(F))
            HasCall = true;
          if (F->getIntrinsicID() == Intrinsic::memcpy ||
              F->getIntrinsicID() == Intrinsic::memset)
            NumStores++;
        } else { // indirect call.
          HasCall = true;
        }
      }
      if (isa<StoreInst>(&I)) {
        Type *MemAccessTy = I.getOperand(0)->getType();
        NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, None, 0,
                                     TTI::TCK_RecipThroughput);
      }
    }

  // The z13 processor will run out of store tags if too many stores
  // are fed into it too quickly. Therefore make sure there are not
  // too many stores in the resulting unrolled loop.
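  // E.g., a loop body with three stores caps the unroll count at 12 / 3 = 4,
  // while a store-free body (NumStoresVal == 0) leaves it unlimited here.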
  unsigned const NumStoresVal = *NumStores.getValue();
  unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);

  if (HasCall) {
    // Only allow full unrolling if loop has any calls.
    UP.FullUnrollMaxCount = Max;
    UP.MaxCount = 1;
    return;
  }

  UP.MaxCount = Max;
  if (UP.MaxCount <= 1)
    return;

  // Allow partial and runtime trip count unrolling.
  UP.Partial = UP.Runtime = true;

  UP.PartialThreshold = 75;
  UP.DefaultUnrollRuntimeCount = 4;

  // Allow expensive instructions in the pre-header of the loop.
  UP.AllowExpensiveTripCount = true;

  UP.Force = true;
}

void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                                   TargetTransformInfo::LSRCost &C2) {
  // SystemZ specific: check instruction count (first), and don't care about
  // ImmCost, since offsets are checked explicitly.
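  // std::tie compares lexicographically: fewer instructions wins outright,
  // and NumRegs, AddRecCost etc. only break ties, in that order.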
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.SetupCost);
}

unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (!Vector)
    // Discount the stack pointer. Also leave out %r0, since it can't
    // be used in an address.
    return 14;
  if (ST->hasVector())
    return 32;
  return 0;
}

TypeSize
SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(64);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
                                              unsigned NumStridedMemAccesses,
                                              unsigned NumPrefetches,
                                              bool HasCall) const {
  // Don't prefetch a loop with many far apart accesses.
  if (NumPrefetches > 16)
    return UINT_MAX;

  // Emit prefetch instructions for smaller strides in cases where we think
  // the hardware prefetcher might not be able to keep up.
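  // E.g., 40 strided accesses out of 41 total with no call: (41 - 40) * 32 =
  // 32 <= 40, so prefetches are emitted for any stride down to one byte.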
  if (NumStridedMemAccesses > 32 && !HasCall &&
      (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
    return 1;

  return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
}

bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  EVT VT = TLI->getValueType(DL, DataType);
  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}

// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
  unsigned Size =
      (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
  assert(Size > 0 && "Element must have non-zero size.");
  return Size;
}

// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
// 3.
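// getNumVectorRegs() instead rounds the raw bit count up: <6 x i64> spans
// 6 * 64 = 384 bits, i.e. 384 / 128 = 3 full vector registers.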
static unsigned getNumVectorRegs(Type *Ty) {
  auto *VTy = cast<FixedVectorType>(Ty);
  unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
  assert(WideBits > 0 && "Could not compute size of vector");
  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}

InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // TODO: return a good value for BB-VECTORIZER that includes the
  // immediate loads, which we do not want to count for the loop
  // vectorizer, since they are hopefully hoisted out of the loop. This
  // would require a new parameter 'InLoop', but not sure if constant
  // args are common enough to motivate this.

  unsigned ScalarBits = Ty->getScalarSizeInBits();

  // There are three cases of division and remainder: Dividing with a register
  // needs a divide instruction. A divisor which is a power of two constant
  // can be implemented with a sequence of shifts. Any other constant needs a
  // multiply and shifts.
  const unsigned DivInstrCost = 20;
  const unsigned DivMulSeqCost = 10;
  const unsigned SDivPow2Cost = 4;

  bool SignedDivRem =
      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
  bool UnsignedDivRem =
      Opcode == Instruction::UDiv || Opcode == Instruction::URem;

  // Check for a constant divisor.
  bool DivRemConst = false;
  bool DivRemConstPow2 = false;
  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
    if (const Constant *C = dyn_cast<Constant>(Args[1])) {
      const ConstantInt *CVal =
          (C->getType()->isVectorTy()
               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
               : dyn_cast<const ConstantInt>(C));
      if (CVal != nullptr &&
          (CVal->getValue().isPowerOf2() || (-CVal->getValue()).isPowerOf2()))
        DivRemConstPow2 = true;
      else
        DivRemConst = true;
    }
  }

  if (!Ty->isVectorTy()) {
    // These FP operations are supported with a dedicated instruction for
    // float, double and fp128 (base implementation assumes float generally
    // costs 2).
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
      return 1;

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem)
      return LIBCALL_COST;

    // Give discount for some combined logical operations if supported.
    if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
      if (Opcode == Instruction::Xor) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() &&
                (I->getOpcode() == Instruction::And ||
                 I->getOpcode() == Instruction::Or ||
                 I->getOpcode() == Instruction::Xor))
              return 0;
        }
      }
      else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
              return 0;
        }
      }
    }

    // Or requires one instruction, although it has custom handling for i64.
    if (Opcode == Instruction::Or)
      return 1;

    if (Opcode == Instruction::Xor && ScalarBits == 1) {
      if (ST->hasLoadStoreOnCond2())
        return 5; // 2 * (li 0; loc 1); xor
      return 7; // 2 * ipm sequences ; xor ; shift ; compare
    }

    if (DivRemConstPow2)
      return (SignedDivRem ? SDivPow2Cost : 1);
    if (DivRemConst)
      return DivMulSeqCost;
    if (SignedDivRem || UnsignedDivRem)
      return DivInstrCost;
  }
  else if (ST->hasVector()) {
    auto *VTy = cast<FixedVectorType>(Ty);
    unsigned VF = VTy->getNumElements();
    unsigned NumVectors = getNumVectorRegs(Ty);

    // These vector operations are custom handled, but are still supported
    // with one instruction per vector, regardless of element size.
    if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
        Opcode == Instruction::AShr) {
      return NumVectors;
    }

    if (DivRemConstPow2)
      return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
    if (DivRemConst) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args, Tys);
    }
    if ((SignedDivRem || UnsignedDivRem) && VF > 4)
      // Temporary hack: disable high vectorization factors with integer
      // division/remainder, which will get scalarized and handled with
      // GR128 registers. The mischeduler is not clever enough to avoid
      // spilling yet.
      return 1000;

    // These FP operations are supported with a single vector instruction for
    // double (base implementation assumes float generally costs 2). For
    // FP128, the scalar cost is 1, and there is no overhead since the values
    // are already in scalar registers.
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
      switch (ScalarBits) {
      case 32: {
        // The vector enhancements facility 1 provides v4f32 instructions.
        if (ST->hasVectorEnhancements1())
          return NumVectors;
        // Return the cost of multiple scalar invocations plus the cost of
        // inserting and extracting the values.
        InstructionCost ScalarCost =
            getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
        SmallVector<Type *> Tys(Args.size(), Ty);
        InstructionCost Cost =
            (VF * ScalarCost) + getScalarizationOverhead(VTy, Args, Tys);
        // FIXME: VF 2 for these FP operations are currently just as
        // expensive as for VF 4.
        if (VF == 2)
          Cost *= 2;
        return Cost;
      }
      case 64:
      case 128:
        return NumVectors;
      default:
        break;
      }
    }

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      InstructionCost Cost =
          (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args, Tys);
      // FIXME: VF 2 for float is currently just as expensive as for VF 4.
      if (VF == 2 && ScalarBits == 32)
        Cost *= 2;
      return Cost;
    }
  }

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}

InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                               VectorType *Tp,
                                               ArrayRef<int> Mask, int Index,
                                               VectorType *SubTp) {
  Kind = improveShuffleKindFromMask(Kind, Mask);
  if (ST->hasVector()) {
    unsigned NumVectors = getNumVectorRegs(Tp);

    // TODO: Since fp32 is expanded, the shuffle cost should always be 0.

    // FP128 values are always in scalar registers, so there is no work
    // involved with a shuffle, except for broadcast. In that case register
    // moves are done with a single instruction per element.
    if (Tp->getScalarType()->isFP128Ty())
      return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);

    switch (Kind) {
    case TargetTransformInfo::SK_ExtractSubvector:
      // ExtractSubvector Index indicates start offset.

      // Extracting a subvector from first index is a noop.
      return (Index == 0 ? 0 : NumVectors);

    case TargetTransformInfo::SK_Broadcast:
      // Loop vectorizer calls here to figure out the extra cost of
      // broadcasting a loaded value to all elements of a vector. Since vlrep
      // loads and replicates with a single instruction, adjust the returned
      // value.
      return NumVectors - 1;

    default:

      // SystemZ supports single instruction permutation / replication.
      return NumVectors;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}

// Return the log2 difference of the element sizes of the two vector types.
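// E.g., between i8 and i64 elements the difference is
// log2(64) - log2(8) = 3.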
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
  unsigned Bits0 = Ty0->getScalarSizeInBits();
  unsigned Bits1 = Ty1->getScalarSizeInBits();

  if (Bits1 > Bits0)
    return (Log2_32(Bits1) - Log2_32(Bits0));

  return (Log2_32(Bits0) - Log2_32(Bits1));
}

// Return the number of instructions needed to truncate SrcTy to DstTy.
unsigned SystemZTTIImpl::
getVectorTruncCost(Type *SrcTy, Type *DstTy) {
  assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
  assert(SrcTy->getPrimitiveSizeInBits().getFixedSize() >
             DstTy->getPrimitiveSizeInBits().getFixedSize() &&
         "Packing must reduce size of vector type.");
  assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
             cast<FixedVectorType>(DstTy)->getNumElements() &&
         "Packing should not change number of elements.");

  // TODO: Since fp32 is expanded, the extract cost should always be 0.

  unsigned NumParts = getNumVectorRegs(SrcTy);
  if (NumParts <= 2)
    // Up to 2 vector registers can be truncated efficiently with pack or
    // permute. The latter requires an immediate mask to be loaded, which
    // typically gets hoisted out of a loop. TODO: return a good value for
    // BB-VECTORIZER that includes the immediate loads, which we do not want
    // to count for the loop vectorizer.
    return 1;

  unsigned Cost = 0;
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
  for (unsigned P = 0; P < Log2Diff; ++P) {
    if (NumParts > 1)
      NumParts /= 2;
    Cost += NumParts;
  }
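  // E.g., <8 x i64> -> <8 x i8>: NumParts = 4 and Log2Diff = 3, so the loop
  // above accumulates 2 + 1 + 1 = 4; the special case below takes one off.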

  // Currently, a general mix of permutes and pack instructions is output by
  // isel, which follow the cost computation above except for this case which
  // is one instruction less:
  if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
      DstTy->getScalarSizeInBits() == 8)
    Cost--;

  return Cost;
}

// Return the cost of converting a vector bitmask produced by a compare
// (SrcTy), to the type of the select or extend instruction (DstTy).
unsigned SystemZTTIImpl::
getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
  assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
          "Should only be called with vector types.");

  unsigned PackCost = 0;
  unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
  unsigned DstScalarBits = DstTy->getScalarSizeInBits();
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  if (SrcScalarBits > DstScalarBits)
    // The bitmask will be truncated.
    PackCost = getVectorTruncCost(SrcTy, DstTy);
  else if (SrcScalarBits < DstScalarBits) {
    unsigned DstNumParts = getNumVectorRegs(DstTy);
    // Each vector select needs its part of the bitmask unpacked.
    PackCost = Log2Diff * DstNumParts;
    // Extra cost for moving part of mask before unpacking.
    PackCost += DstNumParts - 1;
  }
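  // E.g., a <4 x i32> compare mask feeding a <4 x i64> select: Log2Diff = 1
  // and DstNumParts = 2, so PackCost = 1 * 2 + (2 - 1) = 3.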

  return PackCost;
}

// Return the type of the compared operands. This is needed to compute the
// cost for a Select / ZExt or SExt instruction.
static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
  Type *OpTy = nullptr;
  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
    OpTy = CI->getOperand(0)->getType();
  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
    if (LogicI->getNumOperands() == 2)
      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
        if (isa<CmpInst>(LogicI->getOperand(1)))
          OpTy = CI0->getOperand(0)->getType();

  if (OpTy != nullptr) {
    if (VF == 1) {
      assert (!OpTy->isVectorTy() && "Expected scalar type");
      return OpTy;
    }
    // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
    // be either scalar or already vectorized with a same or lesser VF.
    Type *ElTy = OpTy->getScalarType();
    return FixedVectorType::get(ElTy, VF);
  }

  return nullptr;
}

// Get the cost of converting a boolean vector to a vector with same width
// and element size as Dst, plus the cost of zero extending if needed.
unsigned SystemZTTIImpl::
getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
                              const Instruction *I) {
  auto *DstVTy = cast<FixedVectorType>(Dst);
  unsigned VF = DstVTy->getNumElements();
  unsigned Cost = 0;
  // If we know the widths of the compared operands, get any cost of
  // converting it to match Dst. Otherwise assume same widths.
  Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
  if (CmpOpTy != nullptr)
    Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
  if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
    // One 'vn' per dst vector with an immediate mask.
    Cost += getNumVectorRegs(Dst);
  return Cost;
}

InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                 Type *Src,
                                                 TTI::CastContextHint CCH,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  // FIXME: Can the logic below also be used for these cost kinds?
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
    auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    return BaseCost == 0 ? BaseCost : 1;
  }

  unsigned DstScalarBits = Dst->getScalarSizeInBits();
  unsigned SrcScalarBits = Src->getScalarSizeInBits();

  if (!Src->isVectorTy()) {
    assert (!Dst->isVectorTy());

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
      if (SrcScalarBits >= 32 ||
          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
        return 1;
      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
    }

    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
        Src->isIntegerTy(1)) {
      if (ST->hasLoadStoreOnCond2())
        return 2; // li 0; loc 1

      // This should be extension of a compare i1 result, which is done with
      // ipm and a varying sequence of instructions.
      unsigned Cost = 0;
      if (Opcode == Instruction::SExt)
        Cost = (DstScalarBits < 64 ? 3 : 4);
      if (Opcode == Instruction::ZExt)
        Cost = 3;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
        // If operands of an fp-type were compared, this costs +1.
        Cost++;
      return Cost;
    }
  }
  else if (ST->hasVector()) {
    // Vector to scalar cast.
    auto *SrcVecTy = cast<FixedVectorType>(Src);
    auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
    if (!DstVecTy) {
      // TODO: tune vector-to-scalar cast.
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    }
    unsigned VF = SrcVecTy->getNumElements();
    unsigned NumDstVectors = getNumVectorRegs(Dst);
    unsigned NumSrcVectors = getNumVectorRegs(Src);

    if (Opcode == Instruction::Trunc) {
      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
        return 0; // Check for NOOP conversions.
      return getVectorTruncCost(Src, Dst);
    }

    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
      if (SrcScalarBits >= 8) {
        // ZExt/SExt will be handled with one unpack per doubling of width.
        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);

        // For types that span multiple vector registers, some additional
        // instructions are used to setup the unpacking.
        unsigned NumSrcVectorOps =
            (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
                            : (NumDstVectors / 2));

        return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
      }
      else if (SrcScalarBits == 1)
        return getBoolVecToIntConversionCost(Opcode, Dst, I);
    }

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
        Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
      // TODO: Fix base implementation which could simplify things a bit here
      // (seems to miss on differentiating on scalar/vector types).

      // Only 64 bit vector conversions are natively supported before z15.
      if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
        if (SrcScalarBits == DstScalarBits)
          return NumDstVectors;

        if (SrcScalarBits == 1)
          return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
      }

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values. Base implementation does not
      // realize float->int gets scalarized.
      InstructionCost ScalarCost = getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
      InstructionCost TotCost = VF * ScalarCost;
      bool NeedsInserts = true, NeedsExtracts = true;
      // FP128 registers do not get inserted or extracted.
      if (DstScalarBits == 128 &&
          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
        NeedsInserts = false;
      if (SrcScalarBits == 128 &&
          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
        NeedsExtracts = false;

      TotCost += getScalarizationOverhead(SrcVecTy, false, NeedsExtracts);
      TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts, false);

      // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
      if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
        TotCost *= 2;

      return TotCost;
    }

    if (Opcode == Instruction::FPTrunc) {
      if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
        return VF /*ldxbr/lexbr*/ +
               getScalarizationOverhead(DstVecTy, true, false);
      else // double -> float
        return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
    }

    if (Opcode == Instruction::FPExt) {
      if (SrcScalarBits == 32 && DstScalarBits == 64) {
        // float -> double is very rare and currently unoptimized. Instead of
        // using vldeb, which can do two at a time, all conversions are
        // scalarized.
        return VF * 2;
      }
      // -> fp128. VF * lxdb/lxeb + extraction of elements.
      return VF + getScalarizationOverhead(SrcVecTy, false, true);
    }
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

// Scalar i8 / i16 operations will typically be made after first extending
// the operands to i32.
static unsigned getOperandsExtensionCost(const Instruction *I) {
  unsigned ExtCost = 0;
  for (Value *Op : I->operands())
    // A load of i8 or i16 sign/zero extends to i32.
    if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
      ExtCost++;

  return ExtCost;
}

InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                   Type *CondTy,
                                                   CmpInst::Predicate VecPred,
                                                   TTI::TargetCostKind CostKind,
                                                   const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);

  if (!ValTy->isVectorTy()) {
    switch (Opcode) {
    case Instruction::ICmp: {
      // A loaded value compared with 0 with multiple users becomes Load and
      // Test. The load is then not foldable, so return 0 cost for the ICmp.
      unsigned ScalarBits = ValTy->getScalarSizeInBits();
      if (I != nullptr && ScalarBits >= 32)
        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
                C->isZero())
              return 0;

      unsigned Cost = 1;
      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
      return Cost;
    }
    case Instruction::Select:
      if (ValTy->isFloatingPointTy())
        return 4; // No load on condition for FP - costs a conditional jump.
      return 1; // Load On Condition / Select Register.
    }
  }
  else if (ST->hasVector()) {
    unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();

    // Called with a compare instruction.
    if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
      unsigned PredicateExtraCost = 0;
      if (I != nullptr) {
        // Some predicates cost one or two extra instructions.
        switch (cast<CmpInst>(I)->getPredicate()) {
        case CmpInst::Predicate::ICMP_NE:
        case CmpInst::Predicate::ICMP_UGE:
        case CmpInst::Predicate::ICMP_ULE:
        case CmpInst::Predicate::ICMP_SGE:
        case CmpInst::Predicate::ICMP_SLE:
          PredicateExtraCost = 1;
          break;
        case CmpInst::Predicate::FCMP_ONE:
        case CmpInst::Predicate::FCMP_ORD:
        case CmpInst::Predicate::FCMP_UEQ:
        case CmpInst::Predicate::FCMP_UNO:
          PredicateExtraCost = 2;
          break;
        default:
          break;
        }
      }

      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
      // floats. FIXME: <2 x float> generates same code as <4 x float>.
      unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
      unsigned NumVecs_cmp = getNumVectorRegs(ValTy);

      unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
      return Cost;
    }
    else { // Called with a select instruction.
      assert (Opcode == Instruction::Select);

      // We can figure out the extra cost of packing / unpacking if the
      // instruction was passed and the compare instruction is found.
      unsigned PackCost = 0;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
      if (CmpOpTy != nullptr)
        PackCost =
          getVectorBitmaskConversionCost(CmpOpTy, ValTy);

      return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
    }
  }

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
}

InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   unsigned Index) {
  // vlvgp will insert two grs into a vector register, so only count half the
  // number of instructions.
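  // E.g., inserting elements 0 and 1 of a <2 x i64> costs 1 + 0, since a
  // single vlvgp materializes both GPRs at once.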
  if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
    return ((Index % 2 == 0) ? 1 : 0);

  if (Opcode == Instruction::ExtractElement) {
    int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);

    // Give a slight penalty for moving out of vector pipeline to FXU unit.
    if (Index == 0 && Val->isIntOrIntVectorTy())
      Cost += 1;

    return Cost;
  }

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}

// Check if a load may be folded as a memory operand in its user.
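// E.g., an i32 load that is zero-extended to i64 and then added to a register
// can fold into a single algf, making the load itself effectively free.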
bool SystemZTTIImpl::
isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
  if (!Ld->hasOneUse())
    return false;
  FoldedValue = Ld;
  const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
  unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
  unsigned TruncBits = 0;
  unsigned SExtBits = 0;
  unsigned ZExtBits = 0;
  if (UserI->hasOneUse()) {
    unsigned UserBits = UserI->getType()->getScalarSizeInBits();
    if (isa<TruncInst>(UserI))
      TruncBits = UserBits;
    else if (isa<SExtInst>(UserI))
      SExtBits = UserBits;
    else if (isa<ZExtInst>(UserI))
      ZExtBits = UserBits;
  }
  if (TruncBits || SExtBits || ZExtBits) {
    FoldedValue = UserI;
    UserI = cast<Instruction>(*UserI->user_begin());
    // Load (single use) -> trunc/extend (single use) -> UserI
  }
  if ((UserI->getOpcode() == Instruction::Sub ||
       UserI->getOpcode() == Instruction::SDiv ||
       UserI->getOpcode() == Instruction::UDiv) &&
      UserI->getOperand(1) != FoldedValue)
    return false; // Not commutative, only RHS foldable.
  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
  // extension was made of the load.
  unsigned LoadOrTruncBits =
      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
  switch (UserI->getOpcode()) {
  case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
  case Instruction::Sub:
  case Instruction::ICmp:
    if (LoadedBits == 32 && ZExtBits == 64)
      return true;
    LLVM_FALLTHROUGH;
  case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
    if (UserI->getOpcode() != Instruction::ICmp) {
      if (LoadedBits == 16 &&
          (SExtBits == 32 ||
           (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
        return true;
      if (LoadOrTruncBits == 16)
        return true;
    }
    LLVM_FALLTHROUGH;
  case Instruction::SDiv: // SE: 32->64
    if (LoadedBits == 32 && SExtBits == 64)
      return true;
    LLVM_FALLTHROUGH;
  case Instruction::UDiv:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    // This also makes sense for float operations, but disabled for now due
    // to regressions.
    // case Instruction::FCmp:
    // case Instruction::FAdd:
    // case Instruction::FSub:
    // case Instruction::FMul:
    // case Instruction::FDiv:

    // All possible extensions of memory checked above.

    // Comparison between memory and immediate.
    if (UserI->getOpcode() == Instruction::ICmp)
      if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
        if (CI->getValue().isIntN(16))
          return true;
    return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
    break;
  }
  return false;
}

static bool isBswapIntrinsicCall(const Value *V) {
  if (const Instruction *I = dyn_cast<Instruction>(V))
    if (auto *CI = dyn_cast<CallInst>(I))
      if (auto *F = CI->getCalledFunction())
        if (F->getIntrinsicID() == Intrinsic::bswap)
          return true;
  return false;
}

InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                                MaybeAlign Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                const Instruction *I) {
  assert(!Src->isVoidTy() && "Invalid type");

  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
    // Store the load or its truncated or extended value in FoldedValue.
    const Instruction *FoldedValue = nullptr;
    if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
      const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
      assert (UserI->getNumOperands() == 2 && "Expected a binop.");

      // UserI can't fold two loads, so in that case return 0 cost only
      // half of the time.
      for (unsigned i = 0; i < 2; ++i) {
        if (UserI->getOperand(i) == FoldedValue)
          continue;

        if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))) {
          LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
          if (!OtherLoad &&
              (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
               isa<ZExtInst>(OtherOp)))
            OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
          if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
            return i == 0; // Both operands foldable.
        }
      }

      return 0; // Only I is foldable in user.
    }
  }

  unsigned NumOps =
    (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));

  // Store/Load reversed saves one instruction.
  if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
      I != nullptr) {
    if (Opcode == Instruction::Load && I->hasOneUse()) {
      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
      // In case of load -> bswap -> store, return normal cost for the load.
      if (isBswapIntrinsicCall(LdUser) &&
          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
        return 0;
    }
    else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
      const Value *StoredVal = SI->getValueOperand();
      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
        return 0;
    }
  }

  if (Src->getScalarSizeInBits() == 128)
    // 128 bit scalars are held in a pair of two 64 bit registers.
    NumOps *= 2;

  return NumOps;
}

// The generic implementation of getInterleavedMemoryOpCost() is based on
// adding costs of the memory operations plus all the extracts and inserts
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
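// E.g., loading an interleave group with Factor = 2 of <4 x i32> values with
// both indices used: the 8 interleaved elements span two vector registers,
// and each index needs one vperm, giving a cost of 2 loads + 2 permutes = 4.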
InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
  unsigned VF = NumElts / Factor;
  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
  unsigned NumPermutes = 0;

  if (Opcode == Instruction::Load) {
    // Loading interleave groups may have gaps, which may mean fewer
    // loads. Find out how many vectors will be loaded in total, and in how
    // many of them each value will be in.
    BitVector UsedInsts(NumVectorMemOps, false);
    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
    for (unsigned Index : Indices)
      for (unsigned Elt = 0; Elt < VF; ++Elt) {
        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
        UsedInsts.set(Vec);
        ValueVecs[Index].set(Vec);
      }
    NumVectorMemOps = UsedInsts.count();

    for (unsigned Index : Indices) {
      // Estimate that each loaded source vector containing this Index
      // requires one operation, except that vperm can handle two input
      // registers first time for each dst vector.
      unsigned NumSrcVecs = ValueVecs[Index].count();
      unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
      assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
    }
  } else {
    // Estimate the permutes for each stored vector as the smaller of the
    // number of elements and the number of source vectors. Subtract one per
    // dst vector for vperm (S.A.).
    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
    unsigned NumDstVecs = NumVectorMemOps;
    assert (NumSrcVecs > 1 && "Expected at least two source vectors.");
    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
  }

  // Cost of load/store operations and the permutations needed.
  return NumVectorMemOps + NumPermutes;
}

static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
    return getNumVectorRegs(RetTy); // VPERM
  return -1;
}

InstructionCost
SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  InstructionCost Cost =
      getVectorIntrinsicInstrCost(ICA.getID(), ICA.getReturnType());
  if (Cost != -1)
    return Cost;
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
llvm::SystemZTTIImpl::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: SystemZTargetTransformInfo.cpp:1138
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
i
i
Definition: README.txt:29
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::TargetTransformInfo::PSK_FastHardware
@ PSK_FastHardware
Definition: TargetTransformInfo.h:592
llvm::BasicTTIImplBase< SystemZTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:38
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:485
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:211
llvm::TargetTransformInfo::LSRCost::NumRegs
unsigned NumRegs
Definition: TargetTransformInfo.h:418
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:453
llvm::SystemZSubtarget::hasLoadStoreOnCond2
bool hasLoadStoreOnCond2() const
Definition: SystemZSubtarget.h:142
llvm
---------------------— PointerInfo ------------------------------------—
Definition: AllocatorList.h:23
llvm::SystemZTTIImpl::getBoolVecToIntConversionCost
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, const Instruction *I)
Definition: SystemZTargetTransformInfo.cpp:705
llvm::EVT::isScalarInteger
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:150
llvm::InstructionCost::getValue
Optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
Definition: InstructionCost.h:87
getVectorIntrinsicInstrCost
static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy)
Definition: SystemZTargetTransformInfo.cpp:1193
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:435
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:720
llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition: TargetTransformInfo.h:469
IntrinsicInst.h
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:426
llvm::Function
Definition: Function.h:61
llvm::SystemZSubtarget::hasVector
bool hasVector() const
Definition: SystemZSubtarget.h:208
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
llvm::BitVector::set
BitVector & set()
Definition: BitVector.h:343
llvm::TargetTransformInfo::PopcntSupportKind
PopcntSupportKind
Flags indicating the kind of support for population count.
Definition: TargetTransformInfo.h:592
llvm::APInt::isPowerOf2
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:429
C1
instcombine should handle this C2 when C1
Definition: README.txt:263
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:319
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:133
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1467
llvm::BasicTTIImplBase< SystemZTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:750
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:150
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:461
llvm::TargetTransformInfo::RGK_Scalar
@ RGK_Scalar
Definition: TargetTransformInfo.h:907
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:214
llvm::TargetTransformInfo::LSRCost::NumIVMuls
unsigned NumIVMuls
Definition: TargetTransformInfo.h:420
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:481
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1403
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:535
llvm::BasicTTIImplBase< SystemZTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:833
llvm::TargetTransformInfo::UnrollingPreferences::FullUnrollMaxCount
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
Definition: TargetTransformInfo.h:473
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:398
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:116
llvm::SystemZInstrInfo
Definition: SystemZInstrInfo.h:174
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
llvm::Type::isFloatingPointTy
bool isFloatingPointTy() const
Return true if this is one of the six floating-point types.
Definition: Type.h:162
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::TargetTransformInfo::UnrollingPreferences::AllowExpensiveTripCount
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...
Definition: TargetTransformInfo.h:490
llvm::SystemZTTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(unsigned ClassID) const
Definition: SystemZTargetTransformInfo.cpp:318
llvm::TargetTransformInfo::LSRCost::Insns
unsigned Insns
TODO: Some of these could be merged.
Definition: TargetTransformInfo.h:417
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:860
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::TargetTransformInfo::LSRCost::AddRecCost
unsigned AddRecCost
Definition: TargetTransformInfo.h:419
getCmpOpsType
static Type * getCmpOpsType(const Instruction *I, unsigned VF=1)
Definition: SystemZTargetTransformInfo.cpp:678
llvm::TargetTransformInfo::LSRCost::SetupCost
unsigned SetupCost
Definition: TargetTransformInfo.h:423
TargetLowering.h
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::SystemZTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: SystemZTargetTransformInfo.cpp:958
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:160
llvm::SystemZSubtarget::hasMiscellaneousExtensions3
bool hasMiscellaneousExtensions3() const
Definition: SystemZSubtarget.h:238
getScalarSizeInBits
static unsigned getScalarSizeInBits(Type *Ty)
Definition: SystemZTargetTransformInfo.cpp:367
llvm::SystemZTTIImpl::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: SystemZTargetTransformInfo.cpp:721
llvm::BasicTTIImplBase< SystemZTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:1108
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:859
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1062
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:35
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::TargetTransformInfo::UnrollingPreferences::Force
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
Definition: TargetTransformInfo.h:493
llvm::BitVector::count
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:154
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:237
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:129
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:109
llvm::Log2_32
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:596
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:153
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1453
llvm::SystemZTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: SystemZTargetTransformInfo.cpp:1200
llvm::BitVector
Definition: BitVector.h:74
llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition: TargetTransformInfo.h:907
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:648
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::MCID::Call
@ Call
Definition: MCInstrDesc.h:153
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
llvm::BasicTTIImplBase< SystemZTTIImpl >::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1060
llvm::SystemZTTIImpl::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: SystemZTargetTransformInfo.cpp:873
getNumVectorRegs
static unsigned getNumVectorRegs(Type *Ty)
Definition: SystemZTargetTransformInfo.cpp:377
llvm::BasicTTIImplBase< SystemZTTIImpl >::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:890
llvm::None
const NoneType None
Definition: None.h:23
llvm::LinearPolySize< TypeSize >::getFixed
static TypeSize getFixed(ScalarTy MinVal)
Definition: TypeSize.h:284
llvm::lltok::Kind
Kind
Definition: LLToken.h:18
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:118
llvm::TargetTransformInfo::PSK_Software
@ PSK_Software
Definition: TargetTransformInfo.h:592
llvm::SystemZSubtarget::hasVectorEnhancements2
bool hasVectorEnhancements2() const
Definition: SystemZSubtarget.h:247
getCalledFunction
static const Function * getCalledFunction(const Value *V, bool LookThroughBitCast, bool &IsNoBuiltin)
Definition: MemoryBuiltins.cpp:118
llvm::CmpInst
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:710
llvm::SystemZTTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
Definition: SystemZTargetTransformInfo.cpp:330
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:201
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::SystemZTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: SystemZTargetTransformInfo.cpp:384
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:304
llvm::divideCeil
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:742
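For instance, a minimal check of the rounding behavior:

#include "llvm/Support/MathExtras.h"
#include <cassert>

void divideCeilExample() {
  assert(llvm::divideCeil(10, 4) == 3); // 10/4 rounds up to 3
  assert(llvm::divideCeil(8, 4) == 2);  // exact division is unchanged
}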
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::isInt< 32 >
constexpr bool isInt< 32 >(int64_t x)
Definition: MathExtras.h:373
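Since the predicate is constexpr, its boundary behavior can be checked at compile time:

#include "llvm/Support/MathExtras.h"

static_assert(llvm::isInt<32>(2147483647), "INT32_MAX is a signed 32-bit value");
static_assert(!llvm::isInt<32>(2147483648LL), "INT32_MAX + 1 is not");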
llvm::SystemZTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: SystemZTargetTransformInfo.cpp:301
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
getOperandsExtensionCost
static unsigned getOperandsExtensionCost(const Instruction *I)
Definition: SystemZTargetTransformInfo.cpp:863
llvm::TargetTransformInfo::LSRCost
Definition: TargetTransformInfo.h:414
llvm::TargetLoweringBase::isTypeLegal
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
Definition: TargetLowering.h:895
llvm::SystemZTTIImpl::isFoldableLoad
bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue)
Definition: SystemZTargetTransformInfo.cpp:980
llvm::SystemZTTIImpl::getVectorBitmaskConversionCost
unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy)
Definition: SystemZTargetTransformInfo.cpp:654
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:428
llvm::M68kBeads::Bits1
@ Bits1
Definition: M68kBaseInfo.h:54
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:886
llvm::SystemZTTIImpl::getVectorTruncCost
unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy)
Definition: SystemZTargetTransformInfo.cpp:612
llvm::isUInt< 32 >
constexpr bool isUInt< 32 >(uint64_t x)
Definition: MathExtras.h:411
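The unsigned counterpart, likewise checkable at compile time:

#include "llvm/Support/MathExtras.h"

static_assert(llvm::isUInt<32>(4294967295ULL), "UINT32_MAX fits in 32 unsigned bits");
static_assert(!llvm::isUInt<32>(1ULL << 32), "2^32 does not");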
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::SystemZTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: SystemZTargetTransformInfo.cpp:559
SystemZTargetTransformInfo.h
SI
StandardInstrumentations SI(Debug, VerifyEach)
llvm::SystemZTTIImpl::isLSRCostLess
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2)
Definition: SystemZTargetTransformInfo.cpp:306
isBswapIntrinsicCall
static bool isBswapIntrinsicCall(const Value *V)
Definition: SystemZTargetTransformInfo.cpp:1058
llvm::TargetTransformInfoImplBase::isLoweredToCall
bool isLoweredToCall(const Function *F) const
Definition: TargetTransformInfoImpl.h:119
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
getElSizeLog2Diff
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1)
Definition: SystemZTargetTransformInfo.cpp:600
llvm::SystemZSubtarget::hasVectorEnhancements1
bool hasVectorEnhancements1() const
Definition: SystemZSubtarget.h:227
llvm::TargetTransformInfo::LSRCost::ScaleCost
unsigned ScaleCost
Definition: TargetTransformInfo.h:424
llvm::BasicTTIImplBase< SystemZTTIImpl >::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: BasicTTIImpl.h:1176
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:878
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition: TargetTransformInfo.h:262
llvm::SystemZTTIImpl::getIntImmCost
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: SystemZTargetTransformInfo.cpp:33
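A hedged sketch of the kind of query constant hoisting issues against this hook; TTI, Ctx, and the immediate value are assumptions for illustration:

// Assumed in scope: llvm::TargetTransformInfo &TTI, llvm::LLVMContext &Ctx.
llvm::APInt Imm(64, 0x12345678);
llvm::Type *I64 = llvm::Type::getInt64Ty(Ctx);
llvm::InstructionCost C =
    TTI.getIntImmCost(Imm, I64, llvm::TargetTransformInfo::TCK_SizeAndLatency);
// A TCC_Free result tells the caller the immediate is not worth hoisting.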
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::SystemZTTIImpl::getIntImmCostIntrin
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: SystemZTargetTransformInfo.cpp:187
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::BasicTTIImplBase< SystemZTTIImpl >::getScalarizationOverhead
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:671
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:215
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
Definition: TargetTransformInfo.h:464
llvm::Type::isPtrOrPtrVectorTy
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:234
llvm::BasicTTIImplBase< SystemZTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:559
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:273
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:175
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:162
llvm::isInt< 16 >
constexpr bool isInt< 16 >(int64_t x)
Definition: MathExtras.h:370
llvm::PPC::getPredicate
Predicate getPredicate(unsigned Condition, unsigned Hint)
Return predicate consisting of specified condition and hint bits.
Definition: PPCPredicates.h:87
llvm::TargetTransformInfo::LSRCost::NumBaseAdds
unsigned NumBaseAdds
Definition: TargetTransformInfo.h:421
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:147
llvm::SystemZTTIImpl::getMinPrefetchStride
unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const override
Definition: SystemZTargetTransformInfo.cpp:343
llvm::SystemZSubtarget::hasPopulationCount
bool hasPopulationCount() const
Definition: SystemZSubtarget.h:151
llvm::SystemZSubtarget::hasMiscellaneousExtensions2
bool hasMiscellaneousExtensions2() const
Definition: SystemZSubtarget.h:211
CostTable.h
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:321
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:207
llvm::TypeSize
Definition: TypeSize.h:417
llvm::LinearPolySize< TypeSize >::getScalable
static TypeSize getScalable(ScalarTy MinVal)
Definition: TypeSize.h:287
llvm::SystemZSubtarget::getInstrInfo
const SystemZInstrInfo * getInstrInfo() const override
Definition: SystemZSubtarget.h:107
llvm::SystemZTTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: SystemZTargetTransformInfo.cpp:1067
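A hedged usage sketch through the TargetTransformInfo facade, assuming TTI and Ctx are in scope and using a 16-byte-aligned <4 x i32> load in address space 0 as the example:

// Reciprocal-throughput cost of loading <4 x i32>.
auto *V4I32 = llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
llvm::InstructionCost LoadCost = TTI.getMemoryOpCost(
    llvm::Instruction::Load, V4I32, llvm::Align(16), /*AddressSpace=*/0,
    llvm::TargetTransformInfo::TCK_RecipThroughput);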
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:183
llvm::TargetTransformInfo::RGK_ScalableVector
@ RGK_ScalableVector
Definition: TargetTransformInfo.h:907
llvm::User::getNumOperands
unsigned getNumOperands() const
Definition: User.h:191
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:148
llvm::TargetTransformInfo::RegisterKind
RegisterKind
Definition: TargetTransformInfo.h:907
llvm::BasicTTIImplBase< SystemZTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: BasicTTIImpl.h:866
llvm::BasicTTIImplBase< SystemZTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1332
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:340
llvm::BasicTTIImplBase< SystemZTTIImpl >::getNumberOfParts
unsigned getNumberOfParts(Type *Tp)
Definition: BasicTTIImpl.h:2001
TargetTransformInfo.h
llvm::SystemZTTIImpl::getPopcntSupport
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
Definition: SystemZTargetTransformInfo.cpp:238
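A sketch of how an optimization might consult this hook before forming a ctpop; TTI is assumed in scope, and the width of 64 is illustrative:

if (TTI.getPopcntSupport(64) ==
    llvm::TargetTransformInfo::PSK_FastHardware) {
  // The target reports a cheap native population count at this width,
  // so emitting llvm.ctpop.i64 is profitable.
}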
llvm::SystemZTTIImpl::getIntImmCostInst
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: SystemZTargetTransformInfo.cpp:66
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition: TargetTransformInfo.h:263
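These enum values are small integers by design, so cost hooks can compose them arithmetically; for instance, a two-instruction materialization is commonly modeled as:

llvm::InstructionCost TwoInsnCost =
    2 * llvm::TargetTransformInfo::TCC_Basic; // roughly two simple instructions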
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
BasicTTIImpl.h
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1409
llvm::SystemZTTIImpl::hasDivRemOp
bool hasDivRemOp(Type *DataType, bool IsSigned)
Definition: SystemZTargetTransformInfo.cpp:360
llvm::SystemZTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: SystemZTargetTransformInfo.cpp:245
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:212
llvm::Type::isFP128Ty
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:156
Debug.h
llvm::TargetTransformInfo::SK_ExtractSubvector
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
Definition: TargetTransformInfo.h:867
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:128
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37