LLVM 17.0.0git
AMDGPUTargetTransformInfo.cpp
1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
18#include "AMDGPUTargetMachine.h"
25#include "llvm/IR/IRBuilder.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
29#include <optional>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "AMDGPUtti"
34
36 "amdgpu-unroll-threshold-private",
37 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
38 cl::init(2700), cl::Hidden);
39
41 "amdgpu-unroll-threshold-local",
42 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
43 cl::init(1000), cl::Hidden);
44
46 "amdgpu-unroll-threshold-if",
47 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
48 cl::init(200), cl::Hidden);
49
51 "amdgpu-unroll-runtime-local",
52 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
53 cl::init(true), cl::Hidden);
54
56 "amdgpu-use-legacy-divergence-analysis",
57 cl::desc("Enable legacy divergence analysis for AMDGPU"),
58 cl::init(false), cl::Hidden);
59
61 "amdgpu-unroll-max-block-to-analyze",
62 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
63 cl::init(32), cl::Hidden);
64
65static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
66 cl::Hidden, cl::init(4000),
67 cl::desc("Cost of alloca argument"));
68
69// If the amount of scratch memory to eliminate exceeds our ability to allocate
70// it into registers we gain nothing by aggressively inlining functions for that
71// heuristic.
72static cl::opt<unsigned>
73 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
74 cl::init(256),
75 cl::desc("Maximum alloca size to use for inline cost"));
76
77// Inliner constraint to achieve reasonable compilation time.
79 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
80 cl::desc("Maximum number of BBs allowed in a function after inlining"
81 " (compile time constraint)"));
82
83static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
84 unsigned Depth = 0) {
85 const Instruction *I = dyn_cast<Instruction>(Cond);
86 if (!I)
87 return false;
88
89 for (const Value *V : I->operand_values()) {
90 if (!L->contains(I))
91 continue;
92 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
93 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
94 return SubLoop->contains(PHI); }))
95 return true;
96 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
97 return true;
98 }
99 return false;
100}
101
102AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
103 : BaseT(TM, F.getParent()->getDataLayout()),
104 TargetTriple(TM->getTargetTriple()),
105 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
106 TLI(ST->getTargetLowering()) {}
107
108void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
109 TTI::UnrollingPreferences &UP,
110 OptimizationRemarkEmitter *ORE) {
111 const Function &F = *L->getHeader()->getParent();
112 UP.Threshold =
113 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
114 UP.MaxCount = std::numeric_limits<unsigned>::max();
115 UP.Partial = true;
116
117 // A conditional branch in a loop back edge needs 3 additional exec
118 // manipulations on average.
119 UP.BEInsns += 3;
120
121 // TODO: Do we want runtime unrolling?
122
123 // Maximum alloca size that can fit into registers. Reserve 16 registers.
124 const unsigned MaxAlloca = (256 - 16) * 4;
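 // (Annotation: 240 remaining dword registers * 4 bytes each = 960 bytes of
 // scratch that can plausibly be kept in registers.)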
125 unsigned ThresholdPrivate = UnrollThresholdPrivate;
126 unsigned ThresholdLocal = UnrollThresholdLocal;
127
128 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
129 // provided threshold value as the default for Threshold
130 if (MDNode *LoopUnrollThreshold =
131 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
132 if (LoopUnrollThreshold->getNumOperands() == 2) {
133 ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
134 LoopUnrollThreshold->getOperand(1));
135 if (MetaThresholdValue) {
136 // We will also use the supplied value for PartialThreshold for now.
137 // We may introduce additional metadata if it becomes necessary in the
138 // future.
139 UP.Threshold = MetaThresholdValue->getSExtValue();
140 UP.PartialThreshold = UP.Threshold;
141 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
142 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
143 }
144 }
145 }
146
147 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
148 for (const BasicBlock *BB : L->getBlocks()) {
149 const DataLayout &DL = BB->getModule()->getDataLayout();
150 unsigned LocalGEPsSeen = 0;
151
152 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
153 return SubLoop->contains(BB); }))
154 continue; // Block belongs to an inner loop.
155
156 for (const Instruction &I : *BB) {
157 // Unroll a loop which contains an "if" statement whose condition is
158 // defined by a PHI belonging to the loop. This may help to eliminate the
159 // if region and potentially even the PHI itself, saving on both divergence
160 // and registers used for the PHI.
161 // Add a small bonus for each such "if" statement.
162 if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
163 if (UP.Threshold < MaxBoost && Br->isConditional()) {
164 BasicBlock *Succ0 = Br->getSuccessor(0);
165 BasicBlock *Succ1 = Br->getSuccessor(1);
166 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
167 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
168 continue;
169 if (dependsOnLocalPhi(L, Br->getCondition())) {
171 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
172 << " for loop:\n"
173 << *L << " due to " << *Br << '\n');
174 if (UP.Threshold >= MaxBoost)
175 return;
176 }
177 }
178 continue;
179 }
180
181 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
182 if (!GEP)
183 continue;
184
185 unsigned AS = GEP->getAddressSpace();
186 unsigned Threshold = 0;
187 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
188 Threshold = ThresholdPrivate;
189 else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
190 Threshold = ThresholdLocal;
191 else
192 continue;
193
194 if (UP.Threshold >= Threshold)
195 continue;
196
197 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
198 const Value *Ptr = GEP->getPointerOperand();
199 const AllocaInst *Alloca =
200 dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
201 if (!Alloca || !Alloca->isStaticAlloca())
202 continue;
203 Type *Ty = Alloca->getAllocatedType();
204 unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
205 if (AllocaSize > MaxAlloca)
206 continue;
207 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
208 AS == AMDGPUAS::REGION_ADDRESS) {
209 LocalGEPsSeen++;
210 // Inhibit unroll for local memory if we have seen addressing not to
211 // a variable; most likely we will be unable to combine it.
212 // Do not unroll too deep inner loops for local memory to give a chance
213 // to unroll an outer loop for a more important reason.
214 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
215 (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
216 !isa<Argument>(GEP->getPointerOperand())))
217 continue;
218 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
219 << *L << " due to LDS use.\n");
221 }
222
223 // Check if GEP depends on a value defined by this loop itself.
224 bool HasLoopDef = false;
225 for (const Value *Op : GEP->operands()) {
226 const Instruction *Inst = dyn_cast<Instruction>(Op);
227 if (!Inst || L->isLoopInvariant(Op))
228 continue;
229
230 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
231 return SubLoop->contains(Inst); }))
232 continue;
233 HasLoopDef = true;
234 break;
235 }
236 if (!HasLoopDef)
237 continue;
238
239 // We want to do whatever we can to limit the number of alloca
240 // instructions that make it through to the code generator. allocas
241 // require us to use indirect addressing, which is slow and prone to
242 // compiler bugs. If this loop does an address calculation on an
243 // alloca ptr, then we want to use a higher than normal loop unroll
244 // threshold. This will give SROA a better chance to eliminate these
245 // allocas.
246 //
247 // We also want to have more unrolling for local memory to let ds
248 // instructions with different offsets combine.
249 //
250 // Don't use the maximum allowed value here as it will make some
251 // programs way too big.
252 UP.Threshold = Threshold;
253 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
254 << " for loop:\n"
255 << *L << " due to " << *GEP << '\n');
256 if (UP.Threshold >= MaxBoost)
257 return;
258 }
259
260 // If we got a GEP in a small BB from an inner loop then increase the max
261 // trip count to analyze for a better cost estimate in the unroller.
262 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
263 UP.MaxIterationsCountToAnalyze = 32;
264 }
265}
266
267void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
268 TTI::PeelingPreferences &PP) {
269 BaseT::getPeelingPreferences(L, SE, PP);
270}
271
272const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
273 // Codegen control options which don't matter.
274 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
275 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
276 AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
277 AMDGPU::FeatureUnalignedAccessMode,
278
279 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
280
281 // Property of the kernel/environment which can't actually differ.
282 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
283 AMDGPU::FeatureTrapHandler,
284
285 // The default assumption needs to be ecc is enabled, but no directly
286 // exposed operations depend on it, so it can be safely inlined.
287 AMDGPU::FeatureSRAMECC,
288
289 // Perf-tuning features
290 AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
291
292GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
293 : BaseT(TM, F.getParent()->getDataLayout()),
294 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
295 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
296 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
297 SIModeRegisterDefaults Mode(F);
298 HasFP32Denormals = Mode.allFP32Denormals();
299 HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
300}
301
302unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
303 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
304 // registers. See getRegisterClassForType for the implementation.
305 // In this case vector registers are not vector in terms of
306 // VGPRs, but those which can hold multiple values.
307
308 // This is really the number of registers to fill when vectorizing /
309 // interleaving loops, so we lie to avoid trying to use all registers.
310 return 4;
311}
312
313TypeSize
314GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
315 switch (K) {
316 case TargetTransformInfo::RGK_Scalar:
317 return TypeSize::getFixed(32);
318 case TargetTransformInfo::RGK_FixedWidthVector:
319 return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
320 case TargetTransformInfo::RGK_ScalableVector:
321 return TypeSize::getScalable(0);
322 }
323 llvm_unreachable("Unsupported register kind");
324}
325
326unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
327 return 32;
328}
329
330unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
331 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
332 return 32 * 4 / ElemWidth;
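 // (Annotation: this keeps a vectorized load/store at 32 * 4 = 128 bits
 // total, e.g. a maximum VF of 4 for 32-bit elements or 8 for 16-bit ones.)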
333 return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
334 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
335 : 1;
336}
337
338unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
339 unsigned ChainSizeInBytes,
340 VectorType *VecTy) const {
341 unsigned VecRegBitWidth = VF * LoadSize;
342 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
343 // TODO: Support element-size less than 32bit?
344 return 128 / LoadSize;
345
346 return VF;
347}
348
349unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
350 unsigned ChainSizeInBytes,
351 VectorType *VecTy) const {
352 unsigned VecRegBitWidth = VF * StoreSize;
353 if (VecRegBitWidth > 128)
354 return 128 / StoreSize;
355
356 return VF;
357}
358
359unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
360 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
361 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
362 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
363 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
364 return 512;
365 }
366
367 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
368 return 8 * ST->getMaxPrivateElementSize();
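 // (Annotation: getMaxPrivateElementSize() is in bytes; the result here is
 // in bits.)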
369
370 // Common to flat, global, local and region. Assume for unknown addrspace.
371 return 128;
372}
373
374bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
375 Align Alignment,
376 unsigned AddrSpace) const {
377 // We allow vectorization of flat stores, even though we may need to decompose
378 // them later if they may access private memory. We don't have enough context
379 // here, and legalization can handle it.
380 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
381 return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
382 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
383 }
384 return true;
385}
386
387bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
388 Align Alignment,
389 unsigned AddrSpace) const {
390 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
391}
392
393bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
394 Align Alignment,
395 unsigned AddrSpace) const {
396 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
397}
398
399// FIXME: Really we would like to issue multiple 128-bit loads and stores per
400// iteration. Should we report a larger size and let it legalize?
401//
402// FIXME: Should we use narrower types for local/region, or account for when
403// unaligned access is legal?
404//
405// FIXME: This could use fine tuning and microbenchmarks.
406Type *GCNTTIImpl::getMemcpyLoopLoweringType(
407 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
408 unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
409 std::optional<uint32_t> AtomicElementSize) const {
410
411 if (AtomicElementSize)
412 return Type::getIntNTy(Context, *AtomicElementSize * 8);
413
414 unsigned MinAlign = std::min(SrcAlign, DestAlign);
415
416 // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
417 // hardware into byte accesses. If you assume all alignments are equally
418 // probable, it's more efficient on average to use short accesses for this
419 // case.
420 if (MinAlign == 2)
421 return Type::getInt16Ty(Context);
422
423 // Not all subtargets have 128-bit DS instructions, and we currently don't
424 // form them by default.
425 if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
426 SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
427 DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
428 DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
429 return Type::getInt64Ty(Context);
430 }
431
432 // Global memory works best with 16-byte accesses. Private memory will also
433 // hit this, although they'll be decomposed.
434 return FixedVectorType::get(Type::getInt32Ty(Context), 4);
435}
436
437void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
438 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
439 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
440 unsigned SrcAlign, unsigned DestAlign,
441 std::optional<uint32_t> AtomicCpySize) const {
442 assert(RemainingBytes < 16);
443
444 if (AtomicCpySize)
445 return BaseT::getMemcpyLoopResidualLoweringType(
446 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
447 DestAlign, AtomicCpySize);
448
449 unsigned MinAlign = std::min(SrcAlign, DestAlign);
450
451 if (MinAlign != 2) {
452 Type *I64Ty = Type::getInt64Ty(Context);
453 while (RemainingBytes >= 8) {
454 OpsOut.push_back(I64Ty);
455 RemainingBytes -= 8;
456 }
457
458 Type *I32Ty = Type::getInt32Ty(Context);
459 while (RemainingBytes >= 4) {
460 OpsOut.push_back(I32Ty);
461 RemainingBytes -= 4;
462 }
463 }
464
465 Type *I16Ty = Type::getInt16Ty(Context);
466 while (RemainingBytes >= 2) {
467 OpsOut.push_back(I16Ty);
468 RemainingBytes -= 2;
469 }
470
471 Type *I8Ty = Type::getInt8Ty(Context);
472 while (RemainingBytes) {
473 OpsOut.push_back(I8Ty);
474 --RemainingBytes;
475 }
476}
477
478unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
479 // Disable unrolling if the loop is not vectorized.
480 // TODO: Enable this again.
481 if (VF.isScalar())
482 return 1;
483
484 return 8;
485}
486
487bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
488 MemIntrinsicInfo &Info) const {
489 switch (Inst->getIntrinsicID()) {
490 case Intrinsic::amdgcn_atomic_inc:
491 case Intrinsic::amdgcn_atomic_dec:
492 case Intrinsic::amdgcn_ds_ordered_add:
493 case Intrinsic::amdgcn_ds_ordered_swap:
494 case Intrinsic::amdgcn_ds_fadd:
495 case Intrinsic::amdgcn_ds_fmin:
496 case Intrinsic::amdgcn_ds_fmax: {
497 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
498 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
499 if (!Ordering || !Volatile)
500 return false; // Invalid.
501
502 unsigned OrderingVal = Ordering->getZExtValue();
503 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
504 return false;
505
506 Info.PtrVal = Inst->getArgOperand(0);
507 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
508 Info.ReadMem = true;
509 Info.WriteMem = true;
510 Info.IsVolatile = !Volatile->isZero();
511 return true;
512 }
513 default:
514 return false;
515 }
516}
517
518InstructionCost GCNTTIImpl::getArithmeticInstrCost(
519 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
520 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
521 ArrayRef<const Value *> Args,
522 const Instruction *CxtI) {
523
524 // Legalize the type.
525 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
526 int ISD = TLI->InstructionOpcodeToISD(Opcode);
527
528 // Because we don't have any legal vector operations, only the legal types,
529 // we need to account for split vectors.
530 unsigned NElts = LT.second.isVector() ?
531 LT.second.getVectorNumElements() : 1;
532
533 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
534
535 switch (ISD) {
536 case ISD::SHL:
537 case ISD::SRL:
538 case ISD::SRA:
539 if (SLT == MVT::i64)
540 return get64BitInstrCost(CostKind) * LT.first * NElts;
541
542 if (ST->has16BitInsts() && SLT == MVT::i16)
543 NElts = (NElts + 1) / 2;
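 // (Annotation: with 16-bit instructions two i16 elements share one 32-bit
 // register, so halve the element count.)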
544
545 // i32
546 return getFullRateInstrCost() * LT.first * NElts;
547 case ISD::ADD:
548 case ISD::SUB:
549 case ISD::AND:
550 case ISD::OR:
551 case ISD::XOR:
552 if (SLT == MVT::i64) {
553 // and, or and xor are typically split into 2 VALU instructions.
554 return 2 * getFullRateInstrCost() * LT.first * NElts;
555 }
556
557 if (ST->has16BitInsts() && SLT == MVT::i16)
558 NElts = (NElts + 1) / 2;
559
560 return LT.first * NElts * getFullRateInstrCost();
561 case ISD::MUL: {
562 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
563 if (SLT == MVT::i64) {
564 const int FullRateCost = getFullRateInstrCost();
565 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
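 // (Annotation: the formula models an i64 multiply as roughly 4 quarter-rate
 // 32-bit multiplies plus 4 full-rate adds per legalized element.)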
566 }
567
568 if (ST->has16BitInsts() && SLT == MVT::i16)
569 NElts = (NElts + 1) / 2;
570
571 // i32
572 return QuarterRateCost * NElts * LT.first;
573 }
574 case ISD::FMUL:
575 // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
576 // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
577 // fused operation.
578 if (CxtI && CxtI->hasOneUse())
579 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
580 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
581 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
582 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
583 return TargetTransformInfo::TCC_Free;
584 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
585 return TargetTransformInfo::TCC_Free;
586
587 // Estimate all types may be fused with contract/unsafe flags
588 const TargetOptions &Options = TLI->getTargetMachine().Options;
589 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
590 Options.UnsafeFPMath ||
591 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
592 return TargetTransformInfo::TCC_Free;
593 }
594 }
595 [[fallthrough]];
596 case ISD::FADD:
597 case ISD::FSUB:
598 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
599 NElts = (NElts + 1) / 2;
600 if (SLT == MVT::f64)
601 return LT.first * NElts * get64BitInstrCost(CostKind);
602
603 if (ST->has16BitInsts() && SLT == MVT::f16)
604 NElts = (NElts + 1) / 2;
605
606 if (SLT == MVT::f32 || SLT == MVT::f16)
607 return LT.first * NElts * getFullRateInstrCost();
608 break;
609 case ISD::FDIV:
610 case ISD::FREM:
611 // FIXME: frem should be handled separately. The fdiv in it is most of it,
612 // but the current lowering is also not entirely correct.
613 if (SLT == MVT::f64) {
614 int Cost = 7 * get64BitInstrCost(CostKind) +
615 getQuarterRateInstrCost(CostKind) +
616 3 * getHalfRateInstrCost(CostKind);
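 // (Annotation: this roughly models the div_scale/rcp/fma/div_fixup sequence
 // used to lower an f64 division.)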
617 // Add cost of workaround.
618 if (!ST->hasUsableDivScaleConditionOutput())
619 Cost += 3 * getFullRateInstrCost();
620
621 return LT.first * Cost * NElts;
622 }
623
624 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
625 // TODO: This is more complicated, unsafe flags etc.
626 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
627 (SLT == MVT::f16 && ST->has16BitInsts())) {
628 return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
629 }
630 }
631
632 if (SLT == MVT::f16 && ST->has16BitInsts()) {
633 // 2 x v_cvt_f32_f16
634 // f32 rcp
635 // f32 fmul
636 // v_cvt_f16_f32
637 // f16 div_fixup
638 int Cost =
639 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
640 return LT.first * Cost * NElts;
641 }
642
643 if (SLT == MVT::f32 || SLT == MVT::f16) {
644 // 4 more v_cvt_* insts without f16 insts support
645 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
646 1 * getQuarterRateInstrCost(CostKind);
647
648 if (!HasFP32Denormals) {
649 // FP mode switches.
650 Cost += 2 * getFullRateInstrCost();
651 }
652
653 return LT.first * NElts * Cost;
654 }
655 break;
656 case ISD::FNEG:
657 // Use the backend's estimation. If fneg is not free each element will cost
658 // one additional instruction.
659 return TLI->isFNegFree(SLT) ? 0 : NElts;
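 // (Annotation: on AMDGPU fneg is typically folded into a source modifier,
 // in which case it is free.)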
660 default:
661 break;
662 }
663
664 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
665 Args, CxtI);
666}
667
668// Return true if there's a potential benefit from using v2f16/v2i16
669// instructions for an intrinsic, even if it requires nontrivial legalization.
670static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
671 switch (ID) {
672 case Intrinsic::fma: // TODO: fmuladd
673 // There's a small benefit to using vector ops in the legalized code.
674 case Intrinsic::round:
675 case Intrinsic::uadd_sat:
676 case Intrinsic::usub_sat:
677 case Intrinsic::sadd_sat:
678 case Intrinsic::ssub_sat:
679 return true;
680 default:
681 return false;
682 }
683}
684
685InstructionCost
686GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
687 TTI::TargetCostKind CostKind) {
688 if (ICA.getID() == Intrinsic::fabs)
689 return 0;
690
691 if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
692 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
693
694 Type *RetTy = ICA.getReturnType();
695
696 // Legalize the type.
697 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
698
699 unsigned NElts = LT.second.isVector() ?
700 LT.second.getVectorNumElements() : 1;
701
702 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
703
704 if (SLT == MVT::f64)
705 return LT.first * NElts * get64BitInstrCost(CostKind);
706
707 if ((ST->has16BitInsts() && SLT == MVT::f16) ||
708 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
709 NElts = (NElts + 1) / 2;
710
711 // TODO: Get more refined intrinsic costs?
712 unsigned InstRate = getQuarterRateInstrCost(CostKind);
713
714 switch (ICA.getID()) {
715 case Intrinsic::fma:
716 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
717 : getQuarterRateInstrCost(CostKind);
718 break;
719 case Intrinsic::uadd_sat:
720 case Intrinsic::usub_sat:
721 case Intrinsic::sadd_sat:
722 case Intrinsic::ssub_sat:
723 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
724 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
725 NElts = 1;
726 break;
727 }
728
729 return LT.first * NElts * InstRate;
730}
731
732InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
733 TTI::TargetCostKind CostKind,
734 const Instruction *I) {
735 assert((I == nullptr || I->getOpcode() == Opcode) &&
736 "Opcode should reflect passed instruction.");
737 const bool SCost =
738 (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
739 const int CBrCost = SCost ? 5 : 7;
740 switch (Opcode) {
741 case Instruction::Br: {
742 // Branch instruction takes about 4 slots on gfx900.
743 auto BI = dyn_cast_or_null<BranchInst>(I);
744 if (BI && BI->isUnconditional())
745 return SCost ? 1 : 4;
746 // Suppose a conditional branch takes an additional 3 exec manipulation
747 // instructions on average.
748 return CBrCost;
749 }
750 case Instruction::Switch: {
751 auto SI = dyn_cast_or_null<SwitchInst>(I);
752 // Each case (including default) takes 1 cmp + 1 cbr instruction on
753 // average.
754 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
755 }
756 case Instruction::Ret:
757 return SCost ? 1 : 10;
758 }
759 return BaseT::getCFInstrCost(Opcode, CostKind, I);
760}
761
762InstructionCost
763GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
764 std::optional<FastMathFlags> FMF,
765 TTI::TargetCostKind CostKind) {
766 if (TTI::requiresOrderedReduction(FMF))
767 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
768
769 EVT OrigTy = TLI->getValueType(DL, Ty);
770
771 // Computes cost on targets that have packed math instructions (which
772 // support 16-bit types only).
773 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
774 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
775
776 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
777 return LT.first * getFullRateInstrCost();
778}
779
780InstructionCost
781GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
782 bool IsUnsigned,
783 TTI::TargetCostKind CostKind) {
784 EVT OrigTy = TLI->getValueType(DL, Ty);
785
786 // Computes cost on targets that have packed math instructions (which
787 // support 16-bit types only).
788 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
789 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
790
791 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
792 return LT.first * getHalfRateInstrCost(CostKind);
793}
794
795InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
796 TTI::TargetCostKind CostKind,
797 unsigned Index, Value *Op0,
798 Value *Op1) {
799 switch (Opcode) {
800 case Instruction::ExtractElement:
801 case Instruction::InsertElement: {
802 unsigned EltSize
803 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
804 if (EltSize < 32) {
805 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
806 return 0;
807 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
808 Op1);
809 }
810
811 // Extracts are just reads of a subregister, so are free. Inserts are
812 // considered free because we don't want to have any cost for scalarizing
813 // operations, and we don't have to copy into a different register class.
814
815 // Dynamic indexing isn't free and is best avoided.
816 return Index == ~0u ? 2 : 0;
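 // (Annotation: an Index of ~0u means the element index is not known at
 // compile time.)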
817 }
818 default:
819 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
820 }
821}
822
823/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
824/// this is analyzing the collective result of all output registers. Otherwise,
825/// this is only querying a specific result index if this returns multiple
826/// registers in a struct.
827bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
828 const CallInst *CI, ArrayRef<unsigned> Indices) const {
829 // TODO: Handle complex extract indices
830 if (Indices.size() > 1)
831 return true;
832
833 const DataLayout &DL = CI->getModule()->getDataLayout();
834 const SIRegisterInfo *TRI = ST->getRegisterInfo();
835 TargetLowering::AsmOperandInfoVector TargetConstraints =
836 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
837
838 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
839
840 int OutputIdx = 0;
841 for (auto &TC : TargetConstraints) {
842 if (TC.Type != InlineAsm::isOutput)
843 continue;
844
845 // Skip outputs we don't care about.
846 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
847 continue;
848
849 TLI->ComputeConstraintToUse(TC, SDValue());
850
851 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
852 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
853
854 // For AGPR constraints null is returned on subtargets without AGPRs, so
855 // assume divergent for null.
856 if (!RC || !TRI->isSGPRClass(RC))
857 return true;
858 }
859
860 return false;
861}
862
863/// \returns true if the new GPU divergence analysis is enabled.
864bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
865 return !UseLegacyDA;
866}
867
868bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
869 const IntrinsicInst *ReadReg) const {
870 Metadata *MD =
871 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
872 StringRef RegName =
873 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
874
875 // Special case registers that look like VCC.
876 MVT VT = MVT::getVT(ReadReg->getType());
877 if (VT == MVT::i1)
878 return true;
879
880 // Special case scalar registers that start with 'v'.
881 if (RegName.startswith("vcc") || RegName.empty())
882 return false;
883
884 // VGPR or AGPR is divergent. There aren't any specially named vector
885 // registers.
886 return RegName[0] == 'v' || RegName[0] == 'a';
887}
888
889/// \returns true if the result of the value could potentially be
890/// different across workitems in a wavefront.
891bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
892 if (const Argument *A = dyn_cast<Argument>(V))
893 return !AMDGPU::isArgPassedInSGPR(A);
894
895 // Loads from the private and flat address spaces are divergent, because
896 // threads can execute the load instruction with the same inputs and get
897 // different results.
898 //
899 // All other loads are not divergent, because if threads issue loads with the
900 // same arguments, they will always get the same result.
901 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
902 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
903 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
904
905 // Atomics are divergent because they are executed sequentially: when an
906 // atomic operation refers to the same address in each thread, then each
907 // thread after the first sees the value written by the previous thread as
908 // original value.
909 if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
910 return true;
911
912 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
913 if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
914 return isReadRegisterSourceOfDivergence(Intrinsic);
915
916 return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
917 }
918
919 // Assume all function calls are a source of divergence.
920 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
921 if (CI->isInlineAsm())
922 return isInlineAsmSourceOfDivergence(CI);
923 return true;
924 }
925
926 // Assume all function calls are a source of divergence.
927 if (isa<InvokeInst>(V))
928 return true;
929
930 return false;
931}
932
933bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
934 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
935 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
936
937 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
938 if (CI->isInlineAsm())
939 return !isInlineAsmSourceOfDivergence(CI);
940 return false;
941 }
942
943 // In most cases TID / wavefrontsize is uniform.
944 //
945 // However, if a kernel has uneven dimensions we can have a value of
946 // workitem-id-x divided by the wavefrontsize non-uniform. For example
947 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
948 // packed into the same wave which gives 1 and 0 after the division by 64
949 // respectively.
950 //
951 // FIXME: limit it to 1D kernels only, although it should be possible
952 // to perform this optimization if the size of the X dimension is a power
953 // of 2; we just do not currently have infrastructure to query it.
954 using namespace llvm::PatternMatch;
955 uint64_t C;
956 if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
957 m_ConstantInt(C))) ||
958 match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
959 m_ConstantInt(C)))) {
960 const Function *F = cast<Instruction>(V)->getFunction();
961 return C >= ST->getWavefrontSizeLog2() &&
962 ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
963 }
964
965 Value *Mask;
966 if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
967 m_Value(Mask)))) {
968 const Function *F = cast<Instruction>(V)->getFunction();
969 const DataLayout &DL = F->getParent()->getDataLayout();
970 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
971 ST->getWavefrontSizeLog2() &&
972 ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
973 }
974
975 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
976 if (!ExtValue)
977 return false;
978
979 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
980 if (!CI)
981 return false;
982
983 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
984 switch (Intrinsic->getIntrinsicID()) {
985 default:
986 return false;
987 case Intrinsic::amdgcn_if:
988 case Intrinsic::amdgcn_else: {
989 ArrayRef<unsigned> Indices = ExtValue->getIndices();
990 return Indices.size() == 1 && Indices[0] == 1;
991 }
992 }
993 }
994
995 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
996 // divergent for the overall struct return. We need to override it in the
997 // case we're extracting an SGPR component here.
998 if (CI->isInlineAsm())
999 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1000
1001 return false;
1002}
1003
1004bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1005 Intrinsic::ID IID) const {
1006 switch (IID) {
1007 case Intrinsic::amdgcn_atomic_inc:
1008 case Intrinsic::amdgcn_atomic_dec:
1009 case Intrinsic::amdgcn_ds_fadd:
1010 case Intrinsic::amdgcn_ds_fmin:
1011 case Intrinsic::amdgcn_ds_fmax:
1012 case Intrinsic::amdgcn_is_shared:
1013 case Intrinsic::amdgcn_is_private:
1014 case Intrinsic::amdgcn_flat_atomic_fadd:
1015 case Intrinsic::amdgcn_flat_atomic_fmax:
1016 case Intrinsic::amdgcn_flat_atomic_fmin:
1017 OpIndexes.push_back(0);
1018 return true;
1019 default:
1020 return false;
1021 }
1022}
1023
1024Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1025 Value *OldV,
1026 Value *NewV) const {
1027 auto IntrID = II->getIntrinsicID();
1028 switch (IntrID) {
1029 case Intrinsic::amdgcn_atomic_inc:
1030 case Intrinsic::amdgcn_atomic_dec:
1031 case Intrinsic::amdgcn_ds_fadd:
1032 case Intrinsic::amdgcn_ds_fmin:
1033 case Intrinsic::amdgcn_ds_fmax: {
1034 const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
1035 if (!IsVolatile->isZero())
1036 return nullptr;
1037 Module *M = II->getParent()->getParent()->getParent();
1038 Type *DestTy = II->getType();
1039 Type *SrcTy = NewV->getType();
1040 Function *NewDecl =
1041 Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
1042 II->setArgOperand(0, NewV);
1043 II->setCalledFunction(NewDecl);
1044 return II;
1045 }
1046 case Intrinsic::amdgcn_is_shared:
1047 case Intrinsic::amdgcn_is_private: {
1048 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1049 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1050 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1051 LLVMContext &Ctx = NewV->getType()->getContext();
1052 ConstantInt *NewVal = (TrueAS == NewAS) ?
1053 ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
1054 return NewVal;
1055 }
1056 case Intrinsic::ptrmask: {
1057 unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1058 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1059 Value *MaskOp = II->getArgOperand(1);
1060 Type *MaskTy = MaskOp->getType();
1061
1062 bool DoTruncate = false;
1063
1064 const GCNTargetMachine &TM =
1065 static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1066 if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1067 // All valid 64-bit to 32-bit casts work by chopping off the high
1068 // bits. Any masking only clearing the low bits will also apply in the new
1069 // address space.
1070 if (DL.getPointerSizeInBits(OldAS) != 64 ||
1071 DL.getPointerSizeInBits(NewAS) != 32)
1072 return nullptr;
1073
1074 // TODO: Do we need to thread more context in here?
1075 KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
1076 if (Known.countMinLeadingOnes() < 32)
1077 return nullptr;
1078
1079 DoTruncate = true;
1080 }
1081
1082 IRBuilder<> B(II);
1083 if (DoTruncate) {
1084 MaskTy = B.getInt32Ty();
1085 MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1086 }
1087
1088 return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1089 {NewV, MaskOp});
1090 }
1091 case Intrinsic::amdgcn_flat_atomic_fadd:
1092 case Intrinsic::amdgcn_flat_atomic_fmax:
1093 case Intrinsic::amdgcn_flat_atomic_fmin: {
1094 Module *M = II->getParent()->getParent()->getParent();
1095 Type *DestTy = II->getType();
1096 Type *SrcTy = NewV->getType();
1097 Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(),
1098 {DestTy, SrcTy, DestTy});
1099 II->setArgOperand(0, NewV);
1100 II->setCalledFunction(NewDecl);
1101 return II;
1102 }
1103 default:
1104 return nullptr;
1105 }
1106}
1107
1108InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1109 VectorType *VT, ArrayRef<int> Mask,
1110 TTI::TargetCostKind CostKind,
1111 int Index, VectorType *SubTp,
1112 ArrayRef<const Value *> Args) {
1113 Kind = improveShuffleKindFromMask(Kind, Mask);
1114 if (ST->hasVOP3PInsts()) {
1115 if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1116 DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1117 // With op_sel VOP3P instructions freely can access the low half or high
1118 // half of a register, so any swizzle is free.
1119
1120 switch (Kind) {
1121 case TTI::SK_Broadcast:
1122 case TTI::SK_Reverse:
1123 case TTI::SK_PermuteSingleSrc:
1124 return 0;
1125 default:
1126 break;
1127 }
1128 }
1129 }
1130
1131 return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
1132}
1133
1134bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1135 const Function *Callee) const {
1136 const TargetMachine &TM = getTLI()->getTargetMachine();
1137 const GCNSubtarget *CallerST
1138 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1139 const GCNSubtarget *CalleeST
1140 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1141
1142 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1143 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1144
1145 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1146 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1147 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1148 return false;
1149
1150 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1151 // no way to support merge for backend defined attributes.
1152 SIModeRegisterDefaults CallerMode(*Caller);
1153 SIModeRegisterDefaults CalleeMode(*Callee);
1154 if (!CallerMode.isInlineCompatible(CalleeMode))
1155 return false;
1156
1157 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1158 Callee->hasFnAttribute(Attribute::InlineHint))
1159 return true;
1160
1161 // Hack to make compile times reasonable.
1162 if (InlineMaxBB) {
1163 // Single BB does not increase total BB amount.
1164 if (Callee->size() == 1)
1165 return true;
1166 size_t BBSize = Caller->size() + Callee->size() - 1;
1167 return BBSize <= InlineMaxBB;
1168 }
1169
1170 return true;
1171}
1172
1173static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
1174 const SITargetLowering *TLI,
1175 const GCNTTIImpl *TTIImpl) {
1176 const int NrOfSGPRUntilSpill = 26;
1177 const int NrOfVGPRUntilSpill = 32;
1178
1179 const DataLayout &DL = TTIImpl->getDataLayout();
1180
1181 unsigned adjustThreshold = 0;
1182 int SGPRsInUse = 0;
1183 int VGPRsInUse = 0;
1184 for (const Use &A : CB->args()) {
1185 SmallVector<EVT, 4> ValueVTs;
1186 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1187 for (auto ArgVT : ValueVTs) {
1188 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1189 CB->getContext(), CB->getCallingConv(), ArgVT);
1190 if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))
1191 SGPRsInUse += CCRegNum;
1192 else
1193 VGPRsInUse += CCRegNum;
1194 }
1195 }
1196
1197 // The cost of passing function arguments through the stack:
1198 // 1 instruction to put a function argument on the stack in the caller.
1199 // 1 instruction to take a function argument from the stack in callee.
1200 // 1 instruction to explicitly take care of data dependencies in the callee
1201 // function.
1202 InstructionCost ArgStackCost(1);
1203 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1204 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1205 AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
1206 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1207 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1208 AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
1209
1210 // The penalty cost is computed relative to the cost of instructions and does
1211 // not model any storage costs.
1212 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1213 *ArgStackCost.getValue() * InlineConstants::getInstrCost();
1214 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1215 *ArgStackCost.getValue() * InlineConstants::getInstrCost();
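 // (Illustration: a call needing 34 VGPRs for its arguments is 2 registers
 // over the spill limit, so it adds 2 * ArgStackCost * getInstrCost() to the
 // inlining threshold.)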
1216 return adjustThreshold;
1217}
1218
1219unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1220 // If we have a pointer to a private array passed into a function
1221 // it will not be optimized out, leaving scratch usage.
1222 // Increase the inline threshold to allow inlining in this case.
1223 unsigned adjustThreshold = 0;
1224 uint64_t AllocaSize = 0;
1225 SmallPtrSet<const AllocaInst *, 8> AIVisited;
1226 for (Value *PtrArg : CB->args()) {
1227 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1228 if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
1229 Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
1230 continue;
1231
1232 PtrArg = getUnderlyingObject(PtrArg);
1233 if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
1234 if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1235 continue;
1236 AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1237 // If the amount of stack memory is excessive we will not be able
1238 // to get rid of the scratch anyway, bail out.
1239 if (AllocaSize > ArgAllocaCutoff) {
1240 AllocaSize = 0;
1241 break;
1242 }
1243 }
1244 }
1245 adjustThreshold +=
1246 adjustInliningThresholdUsingCallee(CB, TLI, this);
1247 adjustThreshold += AllocaSize ? ArgAllocaCost : AllocaSize;
1248 return adjustThreshold;
1249}
1250
1251void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1252 TTI::UnrollingPreferences &UP,
1253 OptimizationRemarkEmitter *ORE) {
1254 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1255}
1256
1257void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1258 TTI::PeelingPreferences &PP) {
1259 CommonTTI.getPeelingPreferences(L, SE, PP);
1260}
1261
1262int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1263 return ST->hasFullRate64Ops()
1264 ? getFullRateInstrCost()
1265 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1266 : getQuarterRateInstrCost(CostKind);
1267}
1268
1269std::pair<InstructionCost, MVT>
1270GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1271 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1272 auto Size = DL.getTypeSizeInBits(Ty);
1273 // Maximum load or store can handle 8 dwords for scalar and 4 for
1274 // vector ALU. Let's assume anything above 8 dwords is expensive
1275 // even if legal.
1276 if (Size <= 256)
1277 return Cost;
1278
1279 Cost.first += (Size + 255) / 256;
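 // (Annotation: for example, a 512-bit type adds 2 to the legalization
 // cost.)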
1280 return Cost;
1281}
aarch64 promote const
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
amdgpu Simplify well known AMD library false FunctionCallee Callee
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
The AMDGPU TargetMachine interface definition for hw codegen targets.
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
static cl::opt< bool > UseLegacyDA("amdgpu-use-legacy-divergence-analysis", cl::desc("Enable legacy divergence analysis for AMDGPU"), cl::init(false), cl::Hidden)
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
This file a TargetTransformInfo::Concept conforming object specific to the AMDGPU target machine.
static const Function * getParent(const Value *V)
SmallVector< MachineOperand, 4 > Cond
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
Hexagon Common GEP
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
LLVMContext & Context
const char LLVMTargetMachineRef TM
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool hasVOP3PInsts() const
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
an instruction to allocate memory on the stack
Definition: Instructions.h:58
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:118
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:163
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:158
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:112
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:850
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
Definition: BasicTTIImpl.h:964
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:610
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:814
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:928
Conditional or Unconditional Branch instruction.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1186
bool isInlineAsm() const
Check if this call is an inline asm statement.
Definition: InstrTypes.h:1476
CallingConv::ID getCallingConv() const
Definition: InstrTypes.h:1467
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1353
void setArgOperand(unsigned i, Value *v)
Definition: InstrTypes.h:1358
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1344
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
Definition: InstrTypes.h:1384
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1447
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
Definition: Constants.h:78
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:833
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:840
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:151
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:406
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:500
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:669
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:302
This instruction extracts a struct member or array element value from an aggregate value.
ArrayRef< unsigned > getIndices() const
Container class for subtarget features.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:704
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:432
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:235
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:544
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:899
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:334
bool hasFastFMAF32() const
Definition: GCNSubtarget.h:326
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:297
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isAlwaysUniform(const Value *V) const
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getNumberOfRegisters(unsigned RCID) const
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getMaxInterleaveFactor(ElementCount VF)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
unsigned getMinVectorRegisterBitWidth() const
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicCpySize) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
unsigned adjustInliningThreshold(const CallBase *CB) const
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
bool isSourceOfDivergence(const Value *V) const
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicElementSize) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:940
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2558
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:70
const BasicBlock * getParent() const
Definition: Instruction.h:90
bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:177
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
Metadata node.
Definition: Metadata.h:943
Machine Value Type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:573
Root of the metadata hierarchy.
Definition: Metadata.h:61
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:398
The optimization diagnostic interface.
Class to represent pointers.
Definition: DerivedTypes.h:632
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:682
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
Definition: SmallPtrSet.h:450
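Illustrative visited-set idiom built on the pair returned by SmallPtrSet::insert (a sketch, not code from this file).
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Returns true the first time V is seen; later calls return false.
static bool markVisited(SmallPtrSet<const Value *, 8> &Visited, const Value *V) {
  return Visited.insert(V).second; // .second == false if V was already present
}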
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:577
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
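Sketch of the SmallVectorImpl out-parameter idiom used by interfaces such as collectFlatAddressOperands above (the index pushed here is a made-up example).
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

// Illustrative: report which operand indexes hold pointers.
static void collectPointerOperandIndexes(SmallVectorImpl<int> &OpIndexes) {
  OpIndexes.push_back(0); // hypothetical: operand 0 is the pointer operand
}

// Caller side: a fixed-capacity SmallVector binds to the SmallVectorImpl&.
static void collectExample() {
  SmallVector<int, 4> Indexes;
  collectPointerOperandIndexes(Indexes);
}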
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
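For orientation, a simplified sketch of how a cost hook typically maps an IR opcode and type into the SelectionDAG view before consulting per-ISD tables (the helper name and return shape are assumptions).
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include <utility>

using namespace llvm;

static std::pair<int, EVT> toISDView(const TargetLoweringBase &TLI,
                                     const DataLayout &DL, unsigned Opcode,
                                     Type *Ty) {
  int ISDOpc = TLI.InstructionOpcodeToISD(Opcode); // e.g. Instruction::Shl -> ISD::SHL
  EVT VT = TLI.getValueType(DL, Ty);               // EVT corresponding to Ty
  return {ISDOpc, VT};
}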
const TargetMachine & getTargetMachine() const
std::vector< AsmOperandInfo > AsmOperandInfoVector
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their prefixes, and also tie in the associated operand values.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
TargetOptions Options
const DataLayout & getDataLayout() const
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicCpySize) const
TargetCostKind
The kind of cost model.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags FMF.
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:322
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:325
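Small illustration of the two TypeSize constructors (the values are arbitrary).
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Illustrative: a fixed 128-bit width versus a scalable "vscale x 128" width.
static TypeSize exampleWidth(bool Scalable) {
  return Scalable ? TypeSize::getScalable(128) : TypeSize::getFixed(128);
}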
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:304
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
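Sketch in the spirit of memcpy-lowering type selection: picking an integer type for a given byte width (the policy shown is an illustration, not this file's).
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

static Type *intTypeForBytes(LLVMContext &Ctx, unsigned Bytes) {
  switch (Bytes) {
  case 1:  return Type::getInt8Ty(Ctx);
  case 2:  return Type::getInt16Ty(Ctx);
  case 4:  return Type::getInt32Ty(Ctx);
  case 8:  return Type::getInt64Ty(Ctx);
  default: return Type::getIntNTy(Ctx, 8 * Bytes); // arbitrary bit width
  }
}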
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:994
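Illustrative single-use check, in the style of folds that only fire when a value has exactly one user (a sketch).
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Returns the unique user of V as an instruction, or nullptr otherwise.
static const Instruction *getSoleUserInst(const Value *V) {
  if (!V->hasOneUse())
    return nullptr;
  return dyn_cast<Instruction>(*V->user_begin());
}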
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
Type * getElementType() const
Definition: DerivedTypes.h:422
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:384
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:378
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:381
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:380
@ FLAT_ADDRESS
Address space for flat memory.
Definition: AMDGPU.h:376
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:377
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
Definition: AMDGPU.h:386
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:382
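Sketch of an address-space check against the AMDGPUAS enumerators above (illustrative helper; the include path assumes an in-tree AMDGPU target build).
#include "AMDGPU.h"                  // AMDGPUAS::* (target-internal header)
#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

static bool isFlatOrGlobal(const PointerType *PTy) {
  unsigned AS = PTy->getAddressSpace();
  return AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS;
}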
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:923
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:704
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:679
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1506
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:147
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:818
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
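Illustrative PatternMatch usage combining the matchers above (a sketch; the pattern recognized is just an example).
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Matches (X & C) or (C & X) for any ConstantInt C and binds X.
static bool isAndWithConstant(Value *V, Value *&X) {
  return match(V, m_c_And(m_Value(X), m_ConstantInt()));
}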
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value, returning the original object being addressed.
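Sketch: tracing an address back to its underlying object, e.g. to see whether an access ultimately comes from an alloca (illustrative helper).
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static bool addressIsAlloca(const Value *Ptr) {
  return isa<AllocaInst>(getUnderlyingObject(Ptr)); // default MaxLookup = 6
}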
MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
Definition: LoopInfo.cpp:1043
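Illustrative query for a named loop-metadata option (the metadata name used is standard unroll metadata, chosen only as an example).
#include "llvm/Analysis/LoopInfo.h"

using namespace llvm;

static bool hasUnrollDisableMD(const Loop *L) {
  return findOptionMDForLoop(L, "llvm.loop.unroll.disable") != nullptr;
}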
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1826
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1833
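Small illustration of the range wrappers (a sketch; the predicate is arbitrary).
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// True if any instruction in BB may touch memory.
static bool blockTouchesMemory(const BasicBlock &BB) {
  return any_of(BB, [](const Instruction &I) {
    return I.mayReadOrWriteMemory();
  });
}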
AtomicOrdering
Atomic ordering for LLVM's memory model.
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
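Sketch of a computeKnownBits query that supplies only a DataLayout, leaving the optional analyses at their defaults (illustrative helper).
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

static bool signBitKnownZero(const Value *V, const DataLayout &DL) {
  KnownBits Known = computeKnownBits(V, DL); // Depth, AC, CxtI, DT left at defaults
  return Known.isNonNegative();              // sign bit known to be zero
}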
@ FAdd
Sum of floats.
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets.
Definition: MathExtras.h:439
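MinAlign returns the largest power of two dividing both inputs; a small worked illustration.
#include "llvm/Support/MathExtras.h"

using namespace llvm;

static_assert(MinAlign(16, 4) == 4, "lowest set bit of (16 | 4) is 4");
static_assert(MinAlign(8, 12) == 4, "lowest set bit of (8 | 12) is 4");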
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > *Offsets=nullptr, uint64_t StartingOffset=0)
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual underlying non-aggregate types that comprise it.
Definition: Analysis.cpp:121
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:34
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:363
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Definition: KnownBits.h:242
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Parameters that control the generic loop unrolling transformation.
unsigned Threshold
The cost threshold for the unrolled loop.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll profitability.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of loop iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminate compare and branch over the loop).
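Sketch of how a target hook typically fills these fields (the numeric values below are arbitrary illustrations, not the AMDGPU defaults).
#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

static void tunePrefs(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Threshold = 300;                  // budget for full unrolling
  UP.PartialThreshold = 150;           // budget for partial/runtime unrolling
  UP.Partial = true;                   // permit partial unrolling
  UP.Runtime = true;                   // permit runtime unrolling
  UP.MaxIterationsCountToAnalyze = 32; // cap full-unroll simulation
}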