AMDGPUTargetTransformInfo.cpp
1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetTransformInfo.h"
18#include "AMDGPUTargetMachine.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "SIModeRegisterDefaults.h"
21#include "llvm/Analysis/InlineCost.h"
22#include "llvm/Analysis/LoopInfo.h"
23#include "llvm/Analysis/ValueTracking.h"
24#include "llvm/CodeGen/Analysis.h"
25#include "llvm/IR/IRBuilder.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27#include "llvm/IR/PatternMatch.h"
28#include "llvm/Support/KnownBits.h"
29#include <optional>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "AMDGPUtti"
34
35static cl::opt<unsigned> UnrollThresholdPrivate(
36 "amdgpu-unroll-threshold-private",
37 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
38 cl::init(2700), cl::Hidden);
39
40static cl::opt<unsigned> UnrollThresholdLocal(
41 "amdgpu-unroll-threshold-local",
42 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
43 cl::init(1000), cl::Hidden);
44
45static cl::opt<unsigned> UnrollThresholdIf(
46 "amdgpu-unroll-threshold-if",
47 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
48 cl::init(200), cl::Hidden);
49
50static cl::opt<bool> UnrollRuntimeLocal(
51 "amdgpu-unroll-runtime-local",
52 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
53 cl::init(true), cl::Hidden);
54
55static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
56 "amdgpu-unroll-max-block-to-analyze",
57 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
58 cl::init(32), cl::Hidden);
59
60static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
61 cl::Hidden, cl::init(4000),
62 cl::desc("Cost of alloca argument"));
63
64// If the amount of scratch memory to eliminate exceeds our ability to allocate
65// it into registers we gain nothing by aggressively inlining functions for that
66// heuristic.
67static cl::opt<unsigned>
68 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
69 cl::init(256),
70 cl::desc("Maximum alloca size to use for inline cost"));
71
72// Inliner constraint to achieve reasonable compilation time.
73static cl::opt<size_t> InlineMaxBB(
74 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
75 cl::desc("Maximum number of BBs allowed in a function after inlining"
76 " (compile time constraint)"));
77
78static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
79 unsigned Depth = 0) {
80 const Instruction *I = dyn_cast<Instruction>(Cond);
81 if (!I)
82 return false;
83
84 for (const Value *V : I->operand_values()) {
85 if (!L->contains(I))
86 continue;
87 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
88 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
89 return SubLoop->contains(PHI); }))
90 return true;
91 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
92 return true;
93 }
94 return false;
95}
96
97AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
98 : BaseT(TM, F.getParent()->getDataLayout()),
99 TargetTriple(TM->getTargetTriple()),
100 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
101 TLI(ST->getTargetLowering()) {}
102
103void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
104 TTI::UnrollingPreferences &UP,
105 OptimizationRemarkEmitter *ORE) {
106 const Function &F = *L->getHeader()->getParent();
107 UP.Threshold =
108 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
109 UP.MaxCount = std::numeric_limits<unsigned>::max();
110 UP.Partial = true;
111
112 // Conditional branch in a loop back edge needs 3 additional exec
113 // manipulations on average.
114 UP.BEInsns += 3;
115
116 // We want to run unroll even for the loops which have been vectorized.
117 UP.UnrollVectorizedLoop = true;
118
119 // TODO: Do we want runtime unrolling?
120
121 // Maximum alloca size that can fit in registers. Reserve 16 registers.
122 const unsigned MaxAlloca = (256 - 16) * 4;
123 unsigned ThresholdPrivate = UnrollThresholdPrivate;
124 unsigned ThresholdLocal = UnrollThresholdLocal;
125
126 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
127 // provided threshold value as the default for Threshold
128 if (MDNode *LoopUnrollThreshold =
129 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
130 if (LoopUnrollThreshold->getNumOperands() == 2) {
131 ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
132 LoopUnrollThreshold->getOperand(1));
133 if (MetaThresholdValue) {
134 // We will also use the supplied value for PartialThreshold for now.
135 // We may introduce additional metadata if it becomes necessary in the
136 // future.
137 UP.Threshold = MetaThresholdValue->getSExtValue();
138 UP.PartialThreshold = UP.Threshold;
139 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
140 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
141 }
142 }
143 }
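// Worked example (illustrative, not part of the upstream source): the metadata
// consumed above is ordinary loop metadata attached to the loop's latch branch.
// Assuming a requested threshold of 1000, the IR would look roughly like:
//
//   br i1 %exitcond, label %exit, label %loop.header, !llvm.loop !0
//   ...
//   !0 = distinct !{!0, !1}
//   !1 = !{!"amdgpu.loop.unroll.threshold", i32 1000}
//
// findOptionMDForLoop() returns !1, and its second operand becomes UP.Threshold.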
144
145 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
146 for (const BasicBlock *BB : L->getBlocks()) {
147 const DataLayout &DL = BB->getModule()->getDataLayout();
148 unsigned LocalGEPsSeen = 0;
149
150 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
151 return SubLoop->contains(BB); }))
152 continue; // Block belongs to an inner loop.
153
154 for (const Instruction &I : *BB) {
155 // Unroll a loop which contains an "if" statement whose condition is
156 // defined by a PHI belonging to the loop. This may help to eliminate
157 // if region and potentially even PHI itself, saving on both divergence
158 // and registers used for the PHI.
159 // Add a small bonus for each of such "if" statements.
160 if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
161 if (UP.Threshold < MaxBoost && Br->isConditional()) {
162 BasicBlock *Succ0 = Br->getSuccessor(0);
163 BasicBlock *Succ1 = Br->getSuccessor(1);
164 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
165 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
166 continue;
167 if (dependsOnLocalPhi(L, Br->getCondition())) {
168 UP.Threshold += UnrollThresholdIf;
169 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
170 << " for loop:\n"
171 << *L << " due to " << *Br << '\n');
172 if (UP.Threshold >= MaxBoost)
173 return;
174 }
175 }
176 continue;
177 }
178
179 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
180 if (!GEP)
181 continue;
182
183 unsigned AS = GEP->getAddressSpace();
184 unsigned Threshold = 0;
185 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
186 Threshold = ThresholdPrivate;
187 else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
188 Threshold = ThresholdLocal;
189 else
190 continue;
191
192 if (UP.Threshold >= Threshold)
193 continue;
194
195 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
196 const Value *Ptr = GEP->getPointerOperand();
197 const AllocaInst *Alloca =
198 dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
199 if (!Alloca || !Alloca->isStaticAlloca())
200 continue;
201 Type *Ty = Alloca->getAllocatedType();
202 unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
203 if (AllocaSize > MaxAlloca)
204 continue;
205 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
206 AS == AMDGPUAS::REGION_ADDRESS) {
207 LocalGEPsSeen++;
208 // Inhibit unroll for local memory if we have seen addressing not to
209 // a variable, most likely we will be unable to combine it.
210 // Do not unroll too deep inner loops for local memory to give a chance
211 // to unroll an outer loop for a more important reason.
212 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
213 (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
214 !isa<Argument>(GEP->getPointerOperand())))
215 continue;
216 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
217 << *L << " due to LDS use.\n");
218 UP.Runtime = UnrollRuntimeLocal;
219 }
220
221 // Check if GEP depends on a value defined by this loop itself.
222 bool HasLoopDef = false;
223 for (const Value *Op : GEP->operands()) {
224 const Instruction *Inst = dyn_cast<Instruction>(Op);
225 if (!Inst || L->isLoopInvariant(Op))
226 continue;
227
228 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
229 return SubLoop->contains(Inst); }))
230 continue;
231 HasLoopDef = true;
232 break;
233 }
234 if (!HasLoopDef)
235 continue;
236
237 // We want to do whatever we can to limit the number of alloca
238 // instructions that make it through to the code generator. allocas
239 // require us to use indirect addressing, which is slow and prone to
240 // compiler bugs. If this loop does an address calculation on an
241 // alloca ptr, then we want to use a higher than normal loop unroll
242 // threshold. This will give SROA a better chance to eliminate these
243 // allocas.
244 //
245 // We also want to have more unrolling for local memory to let ds
246 // instructions with different offsets combine.
247 //
248 // Don't use the maximum allowed value here as it will make some
249 // programs way too big.
250 UP.Threshold = Threshold;
251 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
252 << " for loop:\n"
253 << *L << " due to " << *GEP << '\n');
254 if (UP.Threshold >= MaxBoost)
255 return;
256 }
257
258 // If we got a GEP in a small BB from inner loop then increase max trip
259 // count to analyze for better estimation cost in unroll
260 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
261 UP.MaxIterationsCountToAnalyze = 32;
262 }
263}
264
265void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
266 TTI::PeelingPreferences &PP) {
267 BaseT::getPeelingPreferences(L, SE, PP);
268}
269
270int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
271 return 1024;
272}
273
274const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
275 // Codegen control options which don't matter.
276 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
277 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
278 AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
279 AMDGPU::FeatureUnalignedAccessMode,
280
281 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
282
283 // Property of the kernel/environment which can't actually differ.
284 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
285 AMDGPU::FeatureTrapHandler,
286
287 // The default assumption needs to be ecc is enabled, but no directly
288 // exposed operations depend on it, so it can be safely inlined.
289 AMDGPU::FeatureSRAMECC,
290
291 // Perf-tuning features
292 AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
293
294GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
295 : BaseT(TM, F.getParent()->getDataLayout()),
296 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
297 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
298 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
299 SIModeRegisterDefaults Mode(F, *ST);
300 HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
301 HasFP64FP16Denormals =
302 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
303}
304
305bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
306 return !F || !ST->isSingleLaneExecution(*F);
307}
308
309unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
310 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
311 // registers. See getRegisterClassForType for the implementation.
312 // In this case vector registers are not vector in terms of
313 // VGPRs, but those which can hold multiple values.
314
315 // This is really the number of registers to fill when vectorizing /
316 // interleaving loops, so we lie to avoid trying to use all registers.
317 return 4;
318}
319
320TypeSize
321GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
322 switch (K) {
323 case TargetTransformInfo::RGK_Scalar:
324 return TypeSize::getFixed(32);
325 case TargetTransformInfo::RGK_FixedWidthVector:
326 return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
327 case TargetTransformInfo::RGK_ScalableVector:
328 return TypeSize::getScalable(0);
329 }
330 llvm_unreachable("Unsupported register kind");
331}
332
333unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
334 return 32;
335}
336
337unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
338 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
339 return 32 * 4 / ElemWidth;
340 return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
341 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
342 : 1;
343}
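// Worked example (illustrative): for 16-bit elements a load or store is capped
// at 32 * 4 / 16 = 8 lanes by the branch above, while non-memory operations get
// a factor of 2 only when packed 16-bit (or packed FP32) instructions are
// available, and 1 otherwise.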
344
345unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
346 unsigned ChainSizeInBytes,
347 VectorType *VecTy) const {
348 unsigned VecRegBitWidth = VF * LoadSize;
349 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
350 // TODO: Support element-size less than 32bit?
351 return 128 / LoadSize;
352
353 return VF;
354}
355
356unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
357 unsigned ChainSizeInBytes,
358 VectorType *VecTy) const {
359 unsigned VecRegBitWidth = VF * StoreSize;
360 if (VecRegBitWidth > 128)
361 return 128 / StoreSize;
362
363 return VF;
364}
365
366unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
367 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
368 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
369 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
370 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
371 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
372 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
373 return 512;
374 }
375
376 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
377 return 8 * ST->getMaxPrivateElementSize();
378
379 // Common to flat, global, local and region. Assume for unknown addrspace.
380 return 128;
381}
382
383bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
384 Align Alignment,
385 unsigned AddrSpace) const {
386 // We allow vectorization of flat stores, even though we may need to decompose
387 // them later if they may access private memory. We don't have enough context
388 // here, and legalization can handle it.
389 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
390 return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
391 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
392 }
393 return true;
394}
395
396bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
397 Align Alignment,
398 unsigned AddrSpace) const {
399 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
400}
401
402bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
403 Align Alignment,
404 unsigned AddrSpace) const {
405 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
406}
407
408int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
409 return 1024;
410}
411
412// FIXME: Really we would like to issue multiple 128-bit loads and stores per
413// iteration. Should we report a larger size and let it legalize?
414//
415// FIXME: Should we use narrower types for local/region, or account for when
416// unaligned access is legal?
417//
418// FIXME: This could use fine tuning and microbenchmarks.
419Type *GCNTTIImpl::getMemcpyLoopLoweringType(
420 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
421 unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
422 std::optional<uint32_t> AtomicElementSize) const {
423
424 if (AtomicElementSize)
425 return Type::getIntNTy(Context, *AtomicElementSize * 8);
426
427 unsigned MinAlign = std::min(SrcAlign, DestAlign);
428
429 // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
430 // hardware into byte accesses. If you assume all alignments are equally
431 // probable, it's more efficient on average to use short accesses for this
432 // case.
433 if (MinAlign == 2)
434 return Type::getInt16Ty(Context);
435
436 // Not all subtargets have 128-bit DS instructions, and we currently don't
437 // form them by default.
438 if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
439 SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
440 DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
441 DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
442 return FixedVectorType::get(Type::getInt32Ty(Context), 2);
443 }
444
445 // Global memory works best with 16-byte accesses. Private memory will also
446 // hit this, although they'll be decomposed.
447 return FixedVectorType::get(Type::getInt32Ty(Context), 4);
448}
449
450void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
451 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
452 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
453 unsigned SrcAlign, unsigned DestAlign,
454 std::optional<uint32_t> AtomicCpySize) const {
455 assert(RemainingBytes < 16);
456
457 if (AtomicCpySize)
458 BaseT::getMemcpyLoopResidualLoweringType(
459 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
460 DestAlign, AtomicCpySize);
461
462 unsigned MinAlign = std::min(SrcAlign, DestAlign);
463
464 if (MinAlign != 2) {
465 Type *I64Ty = Type::getInt64Ty(Context);
466 while (RemainingBytes >= 8) {
467 OpsOut.push_back(I64Ty);
468 RemainingBytes -= 8;
469 }
470
471 Type *I32Ty = Type::getInt32Ty(Context);
472 while (RemainingBytes >= 4) {
473 OpsOut.push_back(I32Ty);
474 RemainingBytes -= 4;
475 }
476 }
477
478 Type *I16Ty = Type::getInt16Ty(Context);
479 while (RemainingBytes >= 2) {
480 OpsOut.push_back(I16Ty);
481 RemainingBytes -= 2;
482 }
483
484 Type *I8Ty = Type::getInt8Ty(Context);
485 while (RemainingBytes) {
486 OpsOut.push_back(I8Ty);
487 --RemainingBytes;
488 }
489}
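// Worked example (illustrative): with RemainingBytes == 13 and a minimum
// alignment of 4 or more, the loops above emit i64 + i32 + i8 (8 + 4 + 1
// bytes); with a minimum alignment of exactly 2 they emit six i16 values
// followed by a single i8 instead.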
490
491unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
492 // Disable unrolling if the loop is not vectorized.
493 // TODO: Enable this again.
494 if (VF.isScalar())
495 return 1;
496
497 return 8;
498}
499
500bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
501 MemIntrinsicInfo &Info) const {
502 switch (Inst->getIntrinsicID()) {
503 case Intrinsic::amdgcn_ds_ordered_add:
504 case Intrinsic::amdgcn_ds_ordered_swap:
505 case Intrinsic::amdgcn_ds_fadd:
506 case Intrinsic::amdgcn_ds_fmin:
507 case Intrinsic::amdgcn_ds_fmax: {
508 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
509 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
510 if (!Ordering || !Volatile)
511 return false; // Invalid.
512
513 unsigned OrderingVal = Ordering->getZExtValue();
514 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
515 return false;
516
517 Info.PtrVal = Inst->getArgOperand(0);
518 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
519 Info.ReadMem = true;
520 Info.WriteMem = true;
521 Info.IsVolatile = !Volatile->isZero();
522 return true;
523 }
524 default:
525 return false;
526 }
527}
528
529InstructionCost GCNTTIImpl::getArithmeticInstrCost(
530 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
531 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
532 ArrayRef<const Value *> Args,
533 const Instruction *CxtI) {
534
535 // Legalize the type.
536 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
537 int ISD = TLI->InstructionOpcodeToISD(Opcode);
538
539 // Because we don't have any legal vector operations, but the legal types, we
540 // need to account for split vectors.
541 unsigned NElts = LT.second.isVector() ?
542 LT.second.getVectorNumElements() : 1;
543
544 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
545
546 switch (ISD) {
547 case ISD::SHL:
548 case ISD::SRL:
549 case ISD::SRA:
550 if (SLT == MVT::i64)
551 return get64BitInstrCost(CostKind) * LT.first * NElts;
552
553 if (ST->has16BitInsts() && SLT == MVT::i16)
554 NElts = (NElts + 1) / 2;
555
556 // i32
557 return getFullRateInstrCost() * LT.first * NElts;
558 case ISD::ADD:
559 case ISD::SUB:
560 case ISD::AND:
561 case ISD::OR:
562 case ISD::XOR:
563 if (SLT == MVT::i64) {
564 // and, or and xor are typically split into 2 VALU instructions.
565 return 2 * getFullRateInstrCost() * LT.first * NElts;
566 }
567
568 if (ST->has16BitInsts() && SLT == MVT::i16)
569 NElts = (NElts + 1) / 2;
570
571 return LT.first * NElts * getFullRateInstrCost();
572 case ISD::MUL: {
573 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
574 if (SLT == MVT::i64) {
575 const int FullRateCost = getFullRateInstrCost();
576 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
577 }
578
579 if (ST->has16BitInsts() && SLT == MVT::i16)
580 NElts = (NElts + 1) / 2;
581
582 // i32
583 return QuarterRateCost * NElts * LT.first;
584 }
585 case ISD::FMUL:
586 // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
587 // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
588 // fused operation.
589 if (CxtI && CxtI->hasOneUse())
590 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
591 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
592 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
593 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
594 return TargetTransformInfo::TCC_Free;
595 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
596 return TargetTransformInfo::TCC_Free;
597
598 // Estimate all types may be fused with contract/unsafe flags
599 const TargetOptions &Options = TLI->getTargetMachine().Options;
600 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
601 Options.UnsafeFPMath ||
602 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
603 return TargetTransformInfo::TCC_Free;
604 }
605 }
606 [[fallthrough]];
607 case ISD::FADD:
608 case ISD::FSUB:
609 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
610 NElts = (NElts + 1) / 2;
611 if (SLT == MVT::f64)
612 return LT.first * NElts * get64BitInstrCost(CostKind);
613
614 if (ST->has16BitInsts() && SLT == MVT::f16)
615 NElts = (NElts + 1) / 2;
616
617 if (SLT == MVT::f32 || SLT == MVT::f16)
618 return LT.first * NElts * getFullRateInstrCost();
619 break;
620 case ISD::FDIV:
621 case ISD::FREM:
622 // FIXME: frem should be handled separately. The fdiv in it is most of it,
623 // but the current lowering is also not entirely correct.
624 if (SLT == MVT::f64) {
625 int Cost = 7 * get64BitInstrCost(CostKind) +
626 getQuarterRateInstrCost(CostKind) +
627 3 * getHalfRateInstrCost(CostKind);
628 // Add cost of workaround.
629 if (!ST->hasUsableDivScaleConditionOutput())
630 Cost += 3 * getFullRateInstrCost();
631
632 return LT.first * Cost * NElts;
633 }
634
635 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
636 // TODO: This is more complicated, unsafe flags etc.
637 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
638 (SLT == MVT::f16 && ST->has16BitInsts())) {
639 return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
640 }
641 }
642
643 if (SLT == MVT::f16 && ST->has16BitInsts()) {
644 // 2 x v_cvt_f32_f16
645 // f32 rcp
646 // f32 fmul
647 // v_cvt_f16_f32
648 // f16 div_fixup
649 int Cost =
650 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
651 return LT.first * Cost * NElts;
652 }
653
654 if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
655 TLI->getTargetMachine().Options.UnsafeFPMath)) {
656 // Fast unsafe fdiv lowering:
657 // f32 rcp
658 // f32 fmul
659 int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
660 return LT.first * Cost * NElts;
661 }
662
663 if (SLT == MVT::f32 || SLT == MVT::f16) {
664 // 4 more v_cvt_* insts without f16 insts support
665 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
666 1 * getQuarterRateInstrCost(CostKind);
667
668 if (!HasFP32Denormals) {
669 // FP mode switches.
670 Cost += 2 * getFullRateInstrCost();
671 }
672
673 return LT.first * NElts * Cost;
674 }
675 break;
676 case ISD::FNEG:
677 // Use the backend's estimation. If fneg is not free each element will cost
678 // one additional instruction.
679 return TLI->isFNegFree(SLT) ? 0 : NElts;
680 default:
681 break;
682 }
683
684 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
685 Args, CxtI);
686}
687
688// Return true if there's a potential benefit from using v2f16/v2i16
689// instructions for an intrinsic, even if it requires nontrivial legalization.
690static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
691 switch (ID) {
692 case Intrinsic::fma: // TODO: fmuladd
693 // There's a small benefit to using vector ops in the legalized code.
694 case Intrinsic::round:
695 case Intrinsic::uadd_sat:
696 case Intrinsic::usub_sat:
697 case Intrinsic::sadd_sat:
698 case Intrinsic::ssub_sat:
699 return true;
700 default:
701 return false;
702 }
703}
704
705InstructionCost
706GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
707 TTI::TargetCostKind CostKind) {
708 if (ICA.getID() == Intrinsic::fabs)
709 return 0;
710
711 if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
712 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
713
714 Type *RetTy = ICA.getReturnType();
715
716 // Legalize the type.
717 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
718
719 unsigned NElts = LT.second.isVector() ?
720 LT.second.getVectorNumElements() : 1;
721
722 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
723
724 if (SLT == MVT::f64)
725 return LT.first * NElts * get64BitInstrCost(CostKind);
726
727 if ((ST->has16BitInsts() && SLT == MVT::f16) ||
728 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
729 NElts = (NElts + 1) / 2;
730
731 // TODO: Get more refined intrinsic costs?
732 unsigned InstRate = getQuarterRateInstrCost(CostKind);
733
734 switch (ICA.getID()) {
735 case Intrinsic::fma:
736 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
737 : getQuarterRateInstrCost(CostKind);
738 break;
739 case Intrinsic::uadd_sat:
740 case Intrinsic::usub_sat:
741 case Intrinsic::sadd_sat:
742 case Intrinsic::ssub_sat:
743 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
744 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
745 NElts = 1;
746 break;
747 }
748
749 return LT.first * NElts * InstRate;
750}
751
752InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
753 TTI::TargetCostKind CostKind,
754 const Instruction *I) {
755 assert((I == nullptr || I->getOpcode() == Opcode) &&
756 "Opcode should reflect passed instruction.");
757 const bool SCost =
758 (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
759 const int CBrCost = SCost ? 5 : 7;
760 switch (Opcode) {
761 case Instruction::Br: {
762 // Branch instruction takes about 4 slots on gfx900.
763 auto BI = dyn_cast_or_null<BranchInst>(I);
764 if (BI && BI->isUnconditional())
765 return SCost ? 1 : 4;
766 // Suppose a conditional branch takes an additional 3 exec manipulation
767 // instructions on average.
768 return CBrCost;
769 }
770 case Instruction::Switch: {
771 auto SI = dyn_cast_or_null<SwitchInst>(I);
772 // Each case (including default) takes 1 cmp + 1 cbr instruction on
773 // average.
774 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
775 }
776 case Instruction::Ret:
777 return SCost ? 1 : 10;
778 }
779 return BaseT::getCFInstrCost(Opcode, CostKind, I);
780}
781
782InstructionCost
783GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
784 std::optional<FastMathFlags> FMF,
785 TTI::TargetCostKind CostKind) {
786 if (TTI::requiresOrderedReduction(FMF))
787 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
788
789 EVT OrigTy = TLI->getValueType(DL, Ty);
790
791 // Computes cost on targets that have packed math instructions(which support
792 // 16-bit types only).
793 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
794 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
795
796 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
797 return LT.first * getFullRateInstrCost();
798}
799
800InstructionCost
801GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
802 FastMathFlags FMF,
803 TTI::TargetCostKind CostKind) {
804 EVT OrigTy = TLI->getValueType(DL, Ty);
805
806 // Computes cost on targets that have packed math instructions(which support
807 // 16-bit types only).
808 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
809 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
810
811 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
812 return LT.first * getHalfRateInstrCost(CostKind);
813}
814
815InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
816 TTI::TargetCostKind CostKind,
817 unsigned Index, Value *Op0,
818 Value *Op1) {
819 switch (Opcode) {
820 case Instruction::ExtractElement:
821 case Instruction::InsertElement: {
822 unsigned EltSize
823 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
824 if (EltSize < 32) {
825 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
826 return 0;
827 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
828 Op1);
829 }
830
831 // Extracts are just reads of a subregister, so are free. Inserts are
832 // considered free because we don't want to have any cost for scalarizing
833 // operations, and we don't have to copy into a different register class.
834
835 // Dynamic indexing isn't free and is best avoided.
836 return Index == ~0u ? 2 : 0;
837 }
838 default:
839 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
840 }
841}
842
843/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
844/// this is analyzing the collective result of all output registers. Otherwise,
845/// this is only querying a specific result index if this returns multiple
846/// registers in a struct.
847bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
848 const CallInst *CI, ArrayRef<unsigned> Indices) const {
849 // TODO: Handle complex extract indices
850 if (Indices.size() > 1)
851 return true;
852
853 const DataLayout &DL = CI->getModule()->getDataLayout();
854 const SIRegisterInfo *TRI = ST->getRegisterInfo();
855 TargetLowering::AsmOperandInfoVector TargetConstraints =
856 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
857
858 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
859
860 int OutputIdx = 0;
861 for (auto &TC : TargetConstraints) {
862 if (TC.Type != InlineAsm::isOutput)
863 continue;
864
865 // Skip outputs we don't care about.
866 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
867 continue;
868
869 TLI->ComputeConstraintToUse(TC, SDValue());
870
871 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
872 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
873
874 // For AGPR constraints null is returned on subtargets without AGPRs, so
875 // assume divergent for null.
876 if (!RC || !TRI->isSGPRClass(RC))
877 return true;
878 }
879
880 return false;
881}
882
883bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
884 const IntrinsicInst *ReadReg) const {
885 Metadata *MD =
886 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
887 StringRef RegName =
888 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
889
890 // Special case registers that look like VCC.
891 MVT VT = MVT::getVT(ReadReg->getType());
892 if (VT == MVT::i1)
893 return true;
894
895 // Special case scalar registers that start with 'v'.
896 if (RegName.starts_with("vcc") || RegName.empty())
897 return false;
898
899 // VGPR or AGPR is divergent. There aren't any specially named vector
900 // registers.
901 return RegName[0] == 'v' || RegName[0] == 'a';
902}
903
904/// \returns true if the result of the value could potentially be
905/// different across workitems in a wavefront.
906bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
907 if (const Argument *A = dyn_cast<Argument>(V))
908 return !AMDGPU::isArgPassedInSGPR(A);
909
910 // Loads from the private and flat address spaces are divergent, because
911 // threads can execute the load instruction with the same inputs and get
912 // different results.
913 //
914 // All other loads are not divergent, because if threads issue loads with the
915 // same arguments, they will always get the same result.
916 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
917 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
918 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
919
920 // Atomics are divergent because they are executed sequentially: when an
921 // atomic operation refers to the same address in each thread, then each
922 // thread after the first sees the value written by the previous thread as
923 // original value.
924 if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
925 return true;
926
927 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
928 if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
929 return isReadRegisterSourceOfDivergence(Intrinsic);
930
931 return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
932 }
933
934 // Assume all function calls are a source of divergence.
935 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
936 if (CI->isInlineAsm())
937 return isInlineAsmSourceOfDivergence(CI);
938 return true;
939 }
940
941 // Assume all function calls are a source of divergence.
942 if (isa<InvokeInst>(V))
943 return true;
944
945 return false;
946}
947
948bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
949 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
950 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
951
952 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
953 if (CI->isInlineAsm())
954 return !isInlineAsmSourceOfDivergence(CI);
955 return false;
956 }
957
958 // In most cases TID / wavefrontsize is uniform.
959 //
960 // However, if a kernel has uneven dimensions we can have a value of
961 // workitem-id-x divided by the wavefrontsize non-uniform. For example
962 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
963 // packed into a same wave which gives 1 and 0 after the division by 64
964 // respectively.
965 //
966 // FIXME: limit it to 1D kernels only, although it should be possible
967 // to perform this optimization if the size of the X dimension is a power
968 // of 2; we just do not currently have the infrastructure to query it.
969 using namespace llvm::PatternMatch;
970 uint64_t C;
971 if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
972 m_ConstantInt(C))) ||
973 match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
974 m_ConstantInt(C)))) {
975 const Function *F = cast<Instruction>(V)->getFunction();
976 return C >= ST->getWavefrontSizeLog2() &&
977 ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
978 }
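// Illustrative IR for the pattern matched above, assuming a wave64 subtarget
// and a kernel whose Y and Z dimensions are known to be at most 1:
//
//   %tid  = call i32 @llvm.amdgcn.workitem.id.x()
//   %wave = lshr i32 %tid, 6   ; shift amount >= log2(wavefront size), so every
//                              ; lane in a wave computes the same wave index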
979
980 Value *Mask;
981 if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
982 m_Value(Mask)))) {
983 const Function *F = cast<Instruction>(V)->getFunction();
984 const DataLayout &DL = F->getParent()->getDataLayout();
985 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
986 ST->getWavefrontSizeLog2() &&
987 ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
988 }
989
990 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
991 if (!ExtValue)
992 return false;
993
994 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
995 if (!CI)
996 return false;
997
998 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
999 switch (Intrinsic->getIntrinsicID()) {
1000 default:
1001 return false;
1002 case Intrinsic::amdgcn_if:
1003 case Intrinsic::amdgcn_else: {
1004 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1005 return Indices.size() == 1 && Indices[0] == 1;
1006 }
1007 }
1008 }
1009
1010 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1011 // divergent for the overall struct return. We need to override it in the
1012 // case we're extracting an SGPR component here.
1013 if (CI->isInlineAsm())
1014 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1015
1016 return false;
1017}
1018
1019bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1020 Intrinsic::ID IID) const {
1021 switch (IID) {
1022 case Intrinsic::amdgcn_ds_fadd:
1023 case Intrinsic::amdgcn_ds_fmin:
1024 case Intrinsic::amdgcn_ds_fmax:
1025 case Intrinsic::amdgcn_is_shared:
1026 case Intrinsic::amdgcn_is_private:
1027 case Intrinsic::amdgcn_flat_atomic_fadd:
1028 case Intrinsic::amdgcn_flat_atomic_fmax:
1029 case Intrinsic::amdgcn_flat_atomic_fmin:
1030 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1031 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1032 OpIndexes.push_back(0);
1033 return true;
1034 default:
1035 return false;
1036 }
1037}
1038
1039Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1040 Value *OldV,
1041 Value *NewV) const {
1042 auto IntrID = II->getIntrinsicID();
1043 switch (IntrID) {
1044 case Intrinsic::amdgcn_ds_fadd:
1045 case Intrinsic::amdgcn_ds_fmin:
1046 case Intrinsic::amdgcn_ds_fmax: {
1047 const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
1048 if (!IsVolatile->isZero())
1049 return nullptr;
1050 Module *M = II->getParent()->getParent()->getParent();
1051 Type *DestTy = II->getType();
1052 Type *SrcTy = NewV->getType();
1053 Function *NewDecl =
1054 Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
1055 II->setArgOperand(0, NewV);
1056 II->setCalledFunction(NewDecl);
1057 return II;
1058 }
1059 case Intrinsic::amdgcn_is_shared:
1060 case Intrinsic::amdgcn_is_private: {
1061 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1062 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1063 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1064 LLVMContext &Ctx = NewV->getType()->getContext();
1065 ConstantInt *NewVal = (TrueAS == NewAS) ?
1066 ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
1067 return NewVal;
1068 }
1069 case Intrinsic::ptrmask: {
1070 unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1071 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1072 Value *MaskOp = II->getArgOperand(1);
1073 Type *MaskTy = MaskOp->getType();
1074
1075 bool DoTruncate = false;
1076
1077 const GCNTargetMachine &TM =
1078 static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1079 if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1080 // All valid 64-bit to 32-bit casts work by chopping off the high
1081 // bits. Any masking only clearing the low bits will also apply in the new
1082 // address space.
1083 if (DL.getPointerSizeInBits(OldAS) != 64 ||
1084 DL.getPointerSizeInBits(NewAS) != 32)
1085 return nullptr;
1086
1087 // TODO: Do we need to thread more context in here?
1088 KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
1089 if (Known.countMinLeadingOnes() < 32)
1090 return nullptr;
1091
1092 DoTruncate = true;
1093 }
1094
1095 IRBuilder<> B(II);
1096 if (DoTruncate) {
1097 MaskTy = B.getInt32Ty();
1098 MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1099 }
1100
1101 return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1102 {NewV, MaskOp});
1103 }
1104 case Intrinsic::amdgcn_flat_atomic_fadd:
1105 case Intrinsic::amdgcn_flat_atomic_fmax:
1106 case Intrinsic::amdgcn_flat_atomic_fmin:
1107 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1108 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1109 Type *DestTy = II->getType();
1110 Type *SrcTy = NewV->getType();
1111 unsigned NewAS = SrcTy->getPointerAddressSpace();
1112 if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS))
1113 return nullptr;
1114 Module *M = II->getModule();
1115 Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(),
1116 {DestTy, SrcTy, DestTy});
1117 II->setArgOperand(0, NewV);
1118 II->setCalledFunction(NewDecl);
1119 return II;
1120 }
1121 default:
1122 return nullptr;
1123 }
1124}
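// Illustrative sketch of the ptrmask rewrite above (hypothetical values): when
// InferAddressSpaces proves a flat pointer is actually global, the call
//
//   %p = call ptr @llvm.ptrmask.p0.i64(ptr %flat, i64 -16)
//
// can be replaced by
//
//   %p = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) %global, i64 -16)
//
// Both address spaces are 64-bit, so the mask is reused as-is; only a 64-bit to
// 32-bit address-space change, with a mask whose high 32 bits are known to be
// all ones, takes the truncation path.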
1125
1126InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1127 VectorType *VT, ArrayRef<int> Mask,
1128 TTI::TargetCostKind CostKind,
1129 int Index, VectorType *SubTp,
1130 ArrayRef<const Value *> Args,
1131 const Instruction *CxtI) {
1132 if (!isa<FixedVectorType>(VT))
1133 return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
1134
1135 Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
1136
1137 // Larger vector widths may require additional instructions, but are
1138 // typically cheaper than scalarized versions.
1139 unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
1140 if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1141 DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1142 bool HasVOP3P = ST->hasVOP3PInsts();
1143 unsigned RequestedElts =
1144 count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
1145 if (RequestedElts == 0)
1146 return 0;
1147 switch (Kind) {
1148 case TTI::SK_Broadcast:
1149 case TTI::SK_Reverse:
1150 case TTI::SK_PermuteSingleSrc: {
1151 // With op_sel VOP3P instructions freely can access the low half or high
1152 // half of a register, so any swizzle of two elements is free.
1153 if (HasVOP3P && NumVectorElts == 2)
1154 return 0;
1155 unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1156 // SK_Broadcast just reuses the same mask
1157 unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
1158 return NumPerms + NumPermMasks;
1159 }
1160 case TTI::SK_ExtractSubvector:
1161 case TTI::SK_InsertSubvector: {
1162 // Even aligned accesses are free
1163 if (!(Index % 2))
1164 return 0;
1165 // Insert/extract subvectors only require shifts / extract code to get the
1166 // relevant bits
1167 return alignTo(RequestedElts, 2) / 2;
1168 }
1169 case TTI::SK_PermuteTwoSrc:
1170 case TTI::SK_Splice:
1171 case TTI::SK_Select: {
1172 unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1173 // SK_Select just reuses the same mask
1174 unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
1175 return NumPerms + NumPermMasks;
1176 }
1177
1178 default:
1179 break;
1180 }
1181 }
1182
1183 return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
1184}
1185
1186bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1187 const Function *Callee) const {
1188 const TargetMachine &TM = getTLI()->getTargetMachine();
1189 const GCNSubtarget *CallerST
1190 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1191 const GCNSubtarget *CalleeST
1192 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1193
1194 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1195 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1196
1197 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1198 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1199 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1200 return false;
1201
1202 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1203 // no way to support merge for backend defined attributes.
1204 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1205 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1206 if (!CallerMode.isInlineCompatible(CalleeMode))
1207 return false;
1208
1209 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1210 Callee->hasFnAttribute(Attribute::InlineHint))
1211 return true;
1212
1213 // Hack to make compile times reasonable.
1214 if (InlineMaxBB) {
1215 // Single BB does not increase total BB amount.
1216 if (Callee->size() == 1)
1217 return true;
1218 size_t BBSize = Caller->size() + Callee->size() - 1;
1219 return BBSize <= InlineMaxBB;
1220 }
1221
1222 return true;
1223}
1224
1225static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
1226 const SITargetLowering *TLI,
1227 const GCNTTIImpl *TTIImpl) {
1228 const int NrOfSGPRUntilSpill = 26;
1229 const int NrOfVGPRUntilSpill = 32;
1230
1231 const DataLayout &DL = TTIImpl->getDataLayout();
1232
1233 unsigned adjustThreshold = 0;
1234 int SGPRsInUse = 0;
1235 int VGPRsInUse = 0;
1236 for (const Use &A : CB->args()) {
1237 SmallVector<EVT, 4> ValueVTs;
1238 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1239 for (auto ArgVT : ValueVTs) {
1240 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1241 CB->getContext(), CB->getCallingConv(), ArgVT);
1242 if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))
1243 SGPRsInUse += CCRegNum;
1244 else
1245 VGPRsInUse += CCRegNum;
1246 }
1247 }
1248
1249 // The cost of passing function arguments through the stack:
1250 // 1 instruction to put a function argument on the stack in the caller.
1251 // 1 instruction to take a function argument from the stack in callee.
1252 // 1 instruction to explicitly take care of data dependencies in the
1253 // callee function.
1254 InstructionCost ArgStackCost(1);
1255 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1256 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1257 AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
1258 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1259 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1260 AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
1261
1262 // The penalty cost is computed relative to the cost of instructions and does
1263 // not model any storage costs.
1264 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1265 *ArgStackCost.getValue() * InlineConstants::getInstrCost();
1266 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1267 *ArgStackCost.getValue() * InlineConstants::getInstrCost();
1268 return adjustThreshold;
1269}
1270
1271static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1272 const DataLayout &DL) {
1273 // If we have a pointer to a private array passed into a function
1274 // it will not be optimized out, leaving scratch usage.
1275 // This function calculates the total size in bytes of the memory that would
1276 // end in scratch if the call was not inlined.
1277 unsigned AllocaSize = 0;
1278 SmallPtrSet<const AllocaInst *, 8> AIVisited;
1279 for (Value *PtrArg : CB->args()) {
1280 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1281 if (!Ty)
1282 continue;
1283
1284 unsigned AddrSpace = Ty->getAddressSpace();
1285 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1286 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1287 continue;
1288
1289 const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
1290 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1291 continue;
1292
1293 AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1294 }
1295 return AllocaSize;
1296}
1297
1298unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1299 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1300
1301 // Private object passed as arguments may end up in scratch usage if the call
1302 // is not inlined. Increase the inline threshold to promote inlining.
1303 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1304 if (AllocaSize > 0)
1305 Threshold += ArgAllocaCost;
1306 return Threshold;
1307}
1308
1309unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
1310 const AllocaInst *AI) const {
1311
1312 // Below the cutoff, assume that the private memory objects would be
1313 // optimized
1314 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1315 if (AllocaSize <= ArgAllocaCutoff)
1316 return 0;
1317
1318 // Above the cutoff, we give a cost to each private memory object
1319 // depending its size. If the array can be optimized by SROA this cost is not
1320 // added to the total-cost in the inliner cost analysis.
1321 //
1322 // We choose the total cost of the alloca such that their sum cancels the
1323 // bonus given in the threshold (ArgAllocaCost).
1324 //
1325 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1326 //
1327 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1328 // the single-bb bonus and the vector-bonus.
1329 //
1330 // We compensate the first two multipliers, by repeating logic from the
1331 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1332 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1333 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1334
1335 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1336 return BB.getTerminator()->getNumSuccessors() > 1;
1337 });
1338 if (SingleBB) {
1339 Threshold += Threshold / 2;
1340 }
1341
1342 auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
1343
1344 // Attribute the bonus proportionally to the alloca size
1345 unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
1346
1347 return AllocaThresholdBonus;
1348}
1349
1350void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1351 TTI::UnrollingPreferences &UP,
1352 OptimizationRemarkEmitter *ORE) {
1353 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1354}
1355
1356void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1357 TTI::PeelingPreferences &PP) {
1358 CommonTTI.getPeelingPreferences(L, SE, PP);
1359}
1360
1361int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1362 return ST->hasFullRate64Ops()
1363 ? getFullRateInstrCost()
1364 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1365 : getQuarterRateInstrCost(CostKind);
1366}
1367
1368std::pair<InstructionCost, MVT>
1369GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1370 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1371 auto Size = DL.getTypeSizeInBits(Ty);
1372 // Maximum load or store can handle 8 dwords for scalar and 4 for
1373 // vector ALU. Let's assume anything above 8 dwords is expensive
1374 // even if legal.
1375 if (Size <= 256)
1376 return Cost;
1377
1378 Cost.first += (Size + 255) / 256;
1379 return Cost;
1380}
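// Worked example (illustrative): for a 512-bit vector type the adjustment
// above adds (512 + 255) / 256 == 2 to whatever legalization cost the base
// implementation computed, so very wide loads and stores are treated as
// expensive even when the type is legal after splitting.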
1381
1382unsigned GCNTTIImpl::getPrefetchDistance() const {
1383 return ST->hasPrefetch() ? 128 : 0;
1384}
1385
1386bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
1387 return AMDGPU::isFlatGlobalAddrSpace(AS);
1388}
aarch64 promote const
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
The AMDGPU TargetMachine interface definition for hw codegen targets.
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
This file a TargetTransformInfo::Concept conforming object specific to the AMDGPU target machine.
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
Hexagon Common GEP
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
LLVMContext & Context
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool hasFastFMAF32() const
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
bool hasVOP3PInsts() const
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
int64_t getMaxMemIntrinsicInlineSizeThreshold() const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
an instruction to allocate memory on the stack
Definition: Instructions.h:59
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:125
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:971
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:656
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:893
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:857
Conditional or Unconditional Branch instruction.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1494
bool isInlineAsm() const
Check if this call is an inline asm statement.
Definition: InstrTypes.h:1809
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
CallingConv::ID getCallingConv() const
Definition: InstrTypes.h:1800
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
void setArgOperand(unsigned i, Value *v)
Definition: InstrTypes.h:1692
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1678
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
Definition: InstrTypes.h:1718
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1781
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:849
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:856
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:160
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:410
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:319
This instruction extracts a struct member or array element value from an aggregate value.
ArrayRef< unsigned > getIndices() const
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
bool hasPrefetch() const
Definition: GCNSubtarget.h:895
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:468
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:269
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:580
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:999
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:368
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:335
Generation getGeneration() const
Definition: GCNSubtarget.h:308
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isAlwaysUniform(const Value *V) const
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
int64_t getMaxMemIntrinsicInlineSizeThreshold() const
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
unsigned getNumberOfRegisters(unsigned RCID) const
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool shouldPrefetchAddressSpace(unsigned AS) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getMaxInterleaveFactor(ElementCount VF)
unsigned getInliningThresholdMultiplier() const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
unsigned getMinVectorRegisterBitWidth() const
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicCpySize) const
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
unsigned adjustInliningThreshold(const CallBase *CB) const
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
bool isSourceOfDivergence(const Value *V) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
bool hasBranchDivergence(const Function *F=nullptr) const
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicElementSize) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
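The GCNTTIImpl hooks listed above are usually reached indirectly: a pass requests TargetTransformInfo via TargetIRAnalysis and the wrapper forwards to the target implementation. A minimal sketch under that assumption; TTIQueryPass is a hypothetical pass name and the specific queries are illustrative:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Hypothetical function pass that reads two of the hooks listed above
// through the generic TargetTransformInfo interface.
struct TTIQueryPass : PassInfoMixin<TTIQueryPass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
    // Width of a fixed-width vector register on the selected subtarget.
    TypeSize VecBits =
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
    // Preferred interleave factor for a VF of 4 (value purely illustrative).
    unsigned IF = TTI.getMaxInterleaveFactor(ElementCount::getFixed(4));
    (void)VecBits; (void)IF;
    return PreservedAnalyses::all();
  }
};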
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2667
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:83
const BasicBlock * getParent() const
Definition: Instruction.h:152
bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:184
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Metadata node.
Definition: Metadata.h:1067
Machine Value Type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:587
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
The optimization diagnostic interface.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
std::vector< AsmOperandInfo > AsmOperandInfoVector
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
const DataLayout & getDataLayout() const
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicCpySize) const
TargetCostKind
The kind of cost model.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
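The Type and IntegerType factory functions above, together with FixedVectorType::get and getScalarSizeInBits, cover the type construction used in this file. A small sketch, assuming an existing LLVMContext; the helper name buildTypes is illustrative:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

// Illustrative helper: builds a few integer and vector types.
static void buildTypes(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  IntegerType *I24 = Type::getIntNTy(Ctx, 24);      // arbitrary bit width
  FixedVectorType *V4I32 = FixedVectorType::get(I32, 4);
  unsigned ElemBits = V4I32->getScalarSizeInBits(); // 32
  (void)I24; (void)ElemBits;
}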
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
Type * getElementType() const
Definition: DerivedTypes.h:436
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:422
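The AMDGPUAS enumerators and the AMDGPU:: address-space predicates above are typically combined with Type::getPointerAddressSpace. A minimal sketch, assuming it is compiled inside the AMDGPU backend so the backend-local AMDGPU.h is visible; the helper name pointsToFlatOrGlobal is illustrative:

#include "AMDGPU.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Illustrative helper: true if the pointer lives in a flat or global-like
// address space (LOCAL_ADDRESS, PRIVATE_ADDRESS, etc. are rejected).
static bool pointsToFlatOrGlobal(const Value *Ptr) {
  unsigned AS = Ptr->getType()->getPointerAddressSpace();
  return AMDGPU::isFlatGlobalAddrSpace(AS);
}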
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:240
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:391
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:931
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:706
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:681
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1469
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:921
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
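The PatternMatch matchers above compose into trees that are tested with match(). A minimal sketch that matches a masked logical shift right; the helper name isMaskedLShr and the bound value X are illustrative:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Illustrative helper: matches (X & C1) >>u C2, with the And operands in
// either order; both constants are matched and ignored.
static bool isMaskedLShr(Value *V) {
  Value *X = nullptr;
  if (match(V, m_LShr(m_c_And(m_Value(X), m_ConstantInt()), m_ConstantInt())))
    return true; // X is now bound to the non-constant And operand
  return false;
}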
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Length
Definition: DWP.cpp:456
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
Definition: LoopInfo.cpp:1043
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
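any_of and none_of above are range wrappers over the corresponding std algorithms. A small sketch over a basic block's instructions; the helper name hasCallButNoLoad is illustrative:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Illustrative helper: block contains at least one call but no loads.
static bool hasCallButNoLoad(const BasicBlock &BB) {
  return any_of(BB, [](const Instruction &I) { return isa<CallBase>(I); }) &&
         none_of(BB, [](const Instruction &I) { return isa<LoadInst>(I); });
}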
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ FAdd
Sum of floats.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets.
Definition: MathExtras.h:349
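alignTo and MinAlign above are plain integer helpers; a couple of concrete values make their behavior explicit. A minimal sketch:

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"

using namespace llvm;

// alignTo rounds a size up to the next multiple of the alignment;
// MinAlign returns the strongest alignment both inputs satisfy.
static void alignmentExamples() {
  uint64_t Padded = alignTo(10, Align(8)); // 16
  uint64_t Common = MinAlign(16, 24);      // 8
  (void)Padded; (void)Common;
}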
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
InstructionCost Cost
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:34
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Definition: KnownBits.h:247
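computeKnownBits above fills a KnownBits record that is then read back through accessors such as countMinLeadingOnes. A minimal sketch, assuming V is an integer-typed value and DL is the enclosing module's DataLayout; the helper name knownLeadingOnes is illustrative:

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

// Illustrative helper: minimum number of leading one bits provable for V.
static unsigned knownLeadingOnes(const Value *V, const DataLayout &DL) {
  KnownBits Known(V->getType()->getScalarSizeInBits());
  computeKnownBits(V, Known, DL);
  return Known.countMinLeadingOnes();
}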
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Parameters that control the generic loop unrolling transformation.
unsigned Threshold
The cost threshold for the unrolled loop.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
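The UnrollingPreferences fields above are what a target's getUnrollingPreferences hook fills in. A hedged sketch of such a hook body with purely illustrative values, not LLVM or AMDGPU defaults; the function name tuneUnrolling is hypothetical:

#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

// Hypothetical hook body; the numbers are placeholders, not real defaults.
static void tuneUnrolling(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Threshold = 250;                  // full-unroll cost budget
  UP.PartialThreshold = 150;           // budget for partial/runtime unrolling
  UP.Partial = true;                   // allow partial unrolling
  UP.Runtime = false;                  // no runtime trip-count based unrolling
  UP.MaxIterationsCountToAnalyze = 16; // cap full-unroll simulation
  UP.UnrollVectorizedLoop = false;     // keep runtime unroll off for vectorized loops
}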