AMDGPUTargetTransformInfo.cpp
1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetTransformInfo.h"
18#include "AMDGPUTargetMachine.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "SIModeRegisterDefaults.h"
21#include "llvm/Analysis/InlineCost.h"
22#include "llvm/Analysis/LoopInfo.h"
23#include "llvm/Analysis/ValueTracking.h"
24#include "llvm/CodeGen/Analysis.h"
25#include "llvm/IR/Function.h"
26#include "llvm/IR/IRBuilder.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
28#include "llvm/IR/PatternMatch.h"
29#include "llvm/Support/KnownBits.h"
30#include <optional>
31
32using namespace llvm;
33
34#define DEBUG_TYPE "AMDGPUtti"
35
36static cl::opt<unsigned> UnrollThresholdPrivate(
37 "amdgpu-unroll-threshold-private",
38 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
39 cl::init(2700), cl::Hidden);
40
41static cl::opt<unsigned> UnrollThresholdLocal(
42 "amdgpu-unroll-threshold-local",
43 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
44 cl::init(1000), cl::Hidden);
45
46static cl::opt<unsigned> UnrollThresholdIf(
47 "amdgpu-unroll-threshold-if",
48 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
49 cl::init(200), cl::Hidden);
50
51static cl::opt<bool> UnrollRuntimeLocal(
52 "amdgpu-unroll-runtime-local",
53 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
54 cl::init(true), cl::Hidden);
55
56static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
57 "amdgpu-unroll-max-block-to-analyze",
58 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
59 cl::init(32), cl::Hidden);
60
61static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
62 cl::Hidden, cl::init(4000),
63 cl::desc("Cost of alloca argument"));
64
65// If the amount of scratch memory to eliminate exceeds our ability to allocate
66// it into registers we gain nothing by aggressively inlining functions for that
67// heuristic.
68static cl::opt<unsigned>
69 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
70 cl::init(256),
71 cl::desc("Maximum alloca size to use for inline cost"));
72
73// Inliner constraint to achieve reasonable compilation time.
74static cl::opt<size_t> InlineMaxBB(
75 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
76 cl::desc("Maximum number of BBs allowed in a function after inlining"
77 " (compile time constraint)"));
78
79// This default unroll factor is based on microbenchmarks on gfx1030.
80static cl::opt<unsigned> MemcpyLoopUnroll(
81 "amdgpu-memcpy-loop-unroll",
82 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
83 "operations when lowering memcpy as a loop"),
84 cl::init(16), cl::Hidden);
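// Example usage (illustrative, not part of the upstream source): these are
// hidden cl::opt flags, so assuming the usual LLVM option plumbing they can
// be overridden per compilation, e.g.
//   clang ... -mllvm -amdgpu-unroll-threshold-private=3000
//   opt -amdgpu-memcpy-loop-unroll=8 ...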
85
86static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
87 unsigned Depth = 0) {
88 const Instruction *I = dyn_cast<Instruction>(Cond);
89 if (!I)
90 return false;
91
92 for (const Value *V : I->operand_values()) {
93 if (!L->contains(I))
94 continue;
95 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
96 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
97 return SubLoop->contains(PHI); }))
98 return true;
99 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
100 return true;
101 }
102 return false;
103}
104
105AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
106 : BaseT(TM, F.getDataLayout()),
107 TargetTriple(TM->getTargetTriple()),
108 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
109 TLI(ST->getTargetLowering()) {}
110
111void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
112 TTI::UnrollingPreferences &UP,
113 OptimizationRemarkEmitter *ORE) const {
114 const Function &F = *L->getHeader()->getParent();
115 UP.Threshold =
116 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
117 UP.MaxCount = std::numeric_limits<unsigned>::max();
118 UP.Partial = true;
119
120 // Conditional branch in a loop back edge needs 3 additional exec
121 // manipulations in average.
122 UP.BEInsns += 3;
123
124 // We want to run unroll even for the loops which have been vectorized.
125 UP.UnrollVectorizedLoop = true;
126
127 // TODO: Do we want runtime unrolling?
128
129 // Maximum alloca size that can fit registers. Reserve 16 registers.
130 const unsigned MaxAlloca = (256 - 16) * 4;
131 unsigned ThresholdPrivate = UnrollThresholdPrivate;
132 unsigned ThresholdLocal = UnrollThresholdLocal;
133
134 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
135 // provided threshold value as the default for Threshold
136 if (MDNode *LoopUnrollThreshold =
137 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
138 if (LoopUnrollThreshold->getNumOperands() == 2) {
139 ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
140 LoopUnrollThreshold->getOperand(1));
141 if (MetaThresholdValue) {
142 // We will also use the supplied value for PartialThreshold for now.
143 // We may introduce additional metadata if it becomes necessary in the
144 // future.
145 UP.Threshold = MetaThresholdValue->getSExtValue();
146 UP.PartialThreshold = UP.Threshold;
147 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
148 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
149 }
150 }
151 }
152
153 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
154 for (const BasicBlock *BB : L->getBlocks()) {
155 const DataLayout &DL = BB->getDataLayout();
156 unsigned LocalGEPsSeen = 0;
157
158 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
159 return SubLoop->contains(BB); }))
160 continue; // Block belongs to an inner loop.
161
162 for (const Instruction &I : *BB) {
163 // Unroll a loop which contains an "if" statement whose condition
164 // defined by a PHI belonging to the loop. This may help to eliminate
165 // if region and potentially even PHI itself, saving on both divergence
166 // and registers used for the PHI.
167 // Add a small bonus for each of such "if" statements.
168 if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
169 if (UP.Threshold < MaxBoost && Br->isConditional()) {
170 BasicBlock *Succ0 = Br->getSuccessor(0);
171 BasicBlock *Succ1 = Br->getSuccessor(1);
172 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
173 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
174 continue;
175 if (dependsOnLocalPhi(L, Br->getCondition())) {
176 UP.Threshold += UnrollThresholdIf;
177 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
178 << " for loop:\n"
179 << *L << " due to " << *Br << '\n');
180 if (UP.Threshold >= MaxBoost)
181 return;
182 }
183 }
184 continue;
185 }
186
187 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
188 if (!GEP)
189 continue;
190
191 unsigned AS = GEP->getAddressSpace();
192 unsigned Threshold = 0;
193 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
194 Threshold = ThresholdPrivate;
195 else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
196 Threshold = ThresholdLocal;
197 else
198 continue;
199
200 if (UP.Threshold >= Threshold)
201 continue;
202
203 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
204 const Value *Ptr = GEP->getPointerOperand();
205 const AllocaInst *Alloca =
206 dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
207 if (!Alloca || !Alloca->isStaticAlloca())
208 continue;
209 Type *Ty = Alloca->getAllocatedType();
210 unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
211 if (AllocaSize > MaxAlloca)
212 continue;
213 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
214 AS == AMDGPUAS::REGION_ADDRESS) {
215 LocalGEPsSeen++;
216 // Inhibit unroll for local memory if we have seen addressing not to
217 // a variable, most likely we will be unable to combine it.
218 // Do not unroll too deep inner loops for local memory to give a chance
219 // to unroll an outer loop for a more important reason.
220 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2)
221 continue;
222
223 const Value *V = getUnderlyingObject(GEP->getPointerOperand());
224 if (!isa<GlobalVariable>(V) && !isa<Argument>(V))
225 continue;
226
227 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
228 << *L << " due to LDS use.\n");
229 UP.Runtime = UnrollRuntimeLocal;
230 }
231
232 // Check if GEP depends on a value defined by this loop itself.
233 bool HasLoopDef = false;
234 for (const Value *Op : GEP->operands()) {
235 const Instruction *Inst = dyn_cast<Instruction>(Op);
236 if (!Inst || L->isLoopInvariant(Op))
237 continue;
238
239 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
240 return SubLoop->contains(Inst); }))
241 continue;
242 HasLoopDef = true;
243 break;
244 }
245 if (!HasLoopDef)
246 continue;
247
248 // We want to do whatever we can to limit the number of alloca
249 // instructions that make it through to the code generator. allocas
250 // require us to use indirect addressing, which is slow and prone to
251 // compiler bugs. If this loop does an address calculation on an
252 // alloca ptr, then we want to use a higher than normal loop unroll
253 // threshold. This will give SROA a better chance to eliminate these
254 // allocas.
255 //
256 // We also want to have more unrolling for local memory to let ds
257 // instructions with different offsets combine.
258 //
259 // Don't use the maximum allowed value here as it will make some
260 // programs way too big.
261 UP.Threshold = Threshold;
262 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
263 << " for loop:\n"
264 << *L << " due to " << *GEP << '\n');
265 if (UP.Threshold >= MaxBoost)
266 return;
267 }
268
269 // If we got a GEP in a small BB from inner loop then increase max trip
270 // count to analyze for better estimation cost in unroll
271 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
272 UP.MaxIterationsCountToAnalyze = 32;
273 }
274}
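// Illustrative IR (an assumption about typical usage, not taken from this
// file): findOptionMDForLoop() above matches loop metadata of this shape,
// with operand 1 providing the threshold read into MetaThresholdValue:
//   br i1 %cond, label %header, label %exit, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"amdgpu.loop.unroll.threshold", i32 100}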
275
276void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
277 TTI::PeelingPreferences &PP) const {
278 BaseT::getPeelingPreferences(L, SE, PP);
279}
280
281uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
282 return 1024;
283}
284
285const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
286 // Codegen control options which don't matter.
287 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
288 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
289 AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
290 AMDGPU::FeatureUnalignedAccessMode,
291
292 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
293
294 // Property of the kernel/environment which can't actually differ.
295 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
296 AMDGPU::FeatureTrapHandler,
297
298 // The default assumption needs to be ecc is enabled, but no directly
299 // exposed operations depend on it, so it can be safely inlined.
300 AMDGPU::FeatureSRAMECC,
301
302 // Perf-tuning features
303 AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
304
305GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
306 : BaseT(TM, F.getDataLayout()),
307 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
308 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
309 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
310 SIModeRegisterDefaults Mode(F, *ST);
311 HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
312 HasFP64FP16Denormals =
313 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
314}
315
316bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
317 return !F || !ST->isSingleLaneExecution(*F);
318}
319
320unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
321 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
322 // registers. See getRegisterClassForType for the implementation.
323 // In this case vector registers are not vector in terms of
324 // VGPRs, but those which can hold multiple values.
325
326 // This is really the number of registers to fill when vectorizing /
327 // interleaving loops, so we lie to avoid trying to use all registers.
328 return 4;
329}
330
331TypeSize
332GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
333 switch (K) {
334 case TargetTransformInfo::RGK_Scalar:
335 return TypeSize::getFixed(32);
336 case TargetTransformInfo::RGK_FixedWidthVector:
337 return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
338 case TargetTransformInfo::RGK_ScalableVector:
339 return TypeSize::getScalable(0);
340 }
341 llvm_unreachable("Unsupported register kind");
342}
343
344unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
345 return 32;
346}
347
348unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
349 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
350 return 32 * 4 / ElemWidth;
351 // For a given width return the max number of elements that can be combined
352 // into a wider bit value:
353 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
354 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
355 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
356 : 1;
357}
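// Worked example (illustrative): for loads and stores the result is
// 32 * 4 / ElemWidth, i.e. 16 for i8 and 4 for i32 elements. For other
// opcodes, an i16 element type with 16-bit instructions yields 2, matching
// one packed v2i16 operation per 32-bit VGPR.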
358
359unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
360 unsigned ChainSizeInBytes,
361 VectorType *VecTy) const {
362 unsigned VecRegBitWidth = VF * LoadSize;
363 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
364 // TODO: Support element-size less than 32bit?
365 return 128 / LoadSize;
366
367 return VF;
368}
369
370unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
371 unsigned ChainSizeInBytes,
372 VectorType *VecTy) const {
373 unsigned VecRegBitWidth = VF * StoreSize;
374 if (VecRegBitWidth > 128)
375 return 128 / StoreSize;
376
377 return VF;
378}
379
380unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
381 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
382 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
383 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
384 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
385 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
386 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
387 return 512;
388 }
389
390 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
391 return 8 * ST->getMaxPrivateElementSize();
392
393 // Common to flat, global, local and region. Assume for unknown addrspace.
394 return 128;
395}
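// Example (illustrative): global, constant and buffer address spaces report
// 512 bits here, while a private (scratch) access on a subtarget with a
// 16-byte maximum private element size reports 8 * 16 == 128 bits.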
396
397bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
398 Align Alignment,
399 unsigned AddrSpace) const {
400 // We allow vectorization of flat stores, even though we may need to decompose
401 // them later if they may access private memory. We don't have enough context
402 // here, and legalization can handle it.
403 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
404 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
405 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
406 }
407 return true;
408}
409
410bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
411 Align Alignment,
412 unsigned AddrSpace) const {
413 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
414}
415
416bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
417 Align Alignment,
418 unsigned AddrSpace) const {
419 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
420}
421
422uint64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
423 return 1024;
424}
425
426Type *GCNTTIImpl::getMemcpyLoopLoweringType(
427 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
428 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
429 std::optional<uint32_t> AtomicElementSize) const {
430
431 if (AtomicElementSize)
432 return Type::getIntNTy(Context, *AtomicElementSize * 8);
433
434 // 16-byte accesses achieve the highest copy throughput.
435 // If the operation has a fixed known length that is large enough, it is
436 // worthwhile to return an even wider type and let legalization lower it into
437 // multiple accesses, effectively unrolling the memcpy loop.
438 // We also rely on legalization to decompose into smaller accesses for
439 // subtargets and address spaces where it is necessary.
440 //
441 // Don't unroll if Length is not a constant, since unrolling leads to worse
442 // performance for length values that are smaller or slightly larger than the
443 // total size of the type returned here. Mitigating that would require a more
444 // complex lowering for variable-length memcpy and memmove.
445 unsigned I32EltsInVector = 4;
446 if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Length))
447 return FixedVectorType::get(Type::getInt32Ty(Context),
448 MemcpyLoopUnroll * I32EltsInVector);
449
450 return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
451}
452
453void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
454 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
455 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
456 Align SrcAlign, Align DestAlign,
457 std::optional<uint32_t> AtomicCpySize) const {
458
459 if (AtomicCpySize)
460 return BaseT::getMemcpyLoopResidualLoweringType(
461 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
462 DestAlign, AtomicCpySize);
463
464 Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
465 while (RemainingBytes >= 16) {
466 OpsOut.push_back(I32x4Ty);
467 RemainingBytes -= 16;
468 }
469
470 Type *I64Ty = Type::getInt64Ty(Context);
471 while (RemainingBytes >= 8) {
472 OpsOut.push_back(I64Ty);
473 RemainingBytes -= 8;
474 }
475
476 Type *I32Ty = Type::getInt32Ty(Context);
477 while (RemainingBytes >= 4) {
478 OpsOut.push_back(I32Ty);
479 RemainingBytes -= 4;
480 }
481
482 Type *I16Ty = Type::getInt16Ty(Context);
483 while (RemainingBytes >= 2) {
484 OpsOut.push_back(I16Ty);
485 RemainingBytes -= 2;
486 }
487
488 Type *I8Ty = Type::getInt8Ty(Context);
489 while (RemainingBytes) {
490 OpsOut.push_back(I8Ty);
491 --RemainingBytes;
492 }
493}
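// Worked example (illustrative): a residual of 13 bytes is decomposed by the
// greedy loops above into { i64, i32, i8 } (8 + 4 + 1 bytes).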
494
495unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
496 // Disable unrolling if the loop is not vectorized.
497 // TODO: Enable this again.
498 if (VF.isScalar())
499 return 1;
500
501 return 8;
502}
503
504bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
505 MemIntrinsicInfo &Info) const {
506 switch (Inst->getIntrinsicID()) {
507 case Intrinsic::amdgcn_ds_ordered_add:
508 case Intrinsic::amdgcn_ds_ordered_swap: {
509 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
510 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
511 if (!Ordering || !Volatile)
512 return false; // Invalid.
513
514 unsigned OrderingVal = Ordering->getZExtValue();
515 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
516 return false;
517
518 Info.PtrVal = Inst->getArgOperand(0);
519 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
520 Info.ReadMem = true;
521 Info.WriteMem = true;
522 Info.IsVolatile = !Volatile->isZero();
523 return true;
524 }
525 default:
526 return false;
527 }
528}
529
530InstructionCost GCNTTIImpl::getArithmeticInstrCost(
531 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
532 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
533 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
534
535 // Legalize the type.
536 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
537 int ISD = TLI->InstructionOpcodeToISD(Opcode);
538
539 // Because we don't have any legal vector operations, but the legal types, we
540 // need to account for split vectors.
541 unsigned NElts = LT.second.isVector() ?
542 LT.second.getVectorNumElements() : 1;
543
544 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
545
546 switch (ISD) {
547 case ISD::SHL:
548 case ISD::SRL:
549 case ISD::SRA:
550 if (SLT == MVT::i64)
551 return get64BitInstrCost(CostKind) * LT.first * NElts;
552
553 if (ST->has16BitInsts() && SLT == MVT::i16)
554 NElts = (NElts + 1) / 2;
555
556 // i32
557 return getFullRateInstrCost() * LT.first * NElts;
558 case ISD::ADD:
559 case ISD::SUB:
560 case ISD::AND:
561 case ISD::OR:
562 case ISD::XOR:
563 if (SLT == MVT::i64) {
564 // and, or and xor are typically split into 2 VALU instructions.
565 return 2 * getFullRateInstrCost() * LT.first * NElts;
566 }
567
568 if (ST->has16BitInsts() && SLT == MVT::i16)
569 NElts = (NElts + 1) / 2;
570
571 return LT.first * NElts * getFullRateInstrCost();
572 case ISD::MUL: {
573 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
574 if (SLT == MVT::i64) {
575 const int FullRateCost = getFullRateInstrCost();
576 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
577 }
578
579 if (ST->has16BitInsts() && SLT == MVT::i16)
580 NElts = (NElts + 1) / 2;
581
582 // i32
583 return QuarterRateCost * NElts * LT.first;
584 }
585 case ISD::FMUL:
586 // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
587 // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
588 // fused operation.
589 if (CxtI && CxtI->hasOneUse())
590 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
591 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
592 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
593 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
594 return TargetTransformInfo::TCC_Free;
595 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
596 return TargetTransformInfo::TCC_Free;
597
598 // Estimate all types may be fused with contract/unsafe flags
599 const TargetOptions &Options = TLI->getTargetMachine().Options;
600 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
601 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
602 return TargetTransformInfo::TCC_Free;
603 }
604 }
605 [[fallthrough]];
606 case ISD::FADD:
607 case ISD::FSUB:
608 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
609 NElts = (NElts + 1) / 2;
610 if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
611 NElts = (NElts + 1) / 2;
612 if (SLT == MVT::f64)
613 return LT.first * NElts * get64BitInstrCost(CostKind);
614
615 if (ST->has16BitInsts() && SLT == MVT::f16)
616 NElts = (NElts + 1) / 2;
617
618 if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
619 return LT.first * NElts * getFullRateInstrCost();
620 break;
621 case ISD::FDIV:
622 case ISD::FREM:
623 // FIXME: frem should be handled separately. The fdiv in it is most of it,
624 // but the current lowering is also not entirely correct.
625 if (SLT == MVT::f64) {
626 int Cost = 7 * get64BitInstrCost(CostKind) +
627 getQuarterRateInstrCost(CostKind) +
628 3 * getHalfRateInstrCost(CostKind);
629 // Add cost of workaround.
630 if (!ST->hasUsableDivScaleConditionOutput())
631 Cost += 3 * getFullRateInstrCost();
632
633 return LT.first * Cost * NElts;
634 }
635
636 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
637 // TODO: This is more complicated, unsafe flags etc.
638 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
639 (SLT == MVT::f16 && ST->has16BitInsts())) {
640 return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
641 }
642 }
643
644 if (SLT == MVT::f16 && ST->has16BitInsts()) {
645 // 2 x v_cvt_f32_f16
646 // f32 rcp
647 // f32 fmul
648 // v_cvt_f16_f32
649 // f16 div_fixup
650 int Cost =
651 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
652 return LT.first * Cost * NElts;
653 }
654
655 if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
656 // Fast unsafe fdiv lowering:
657 // f32 rcp
658 // f32 fmul
659 int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
660 return LT.first * Cost * NElts;
661 }
662
663 if (SLT == MVT::f32 || SLT == MVT::f16) {
664 // 4 more v_cvt_* insts without f16 insts support
665 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
666 1 * getQuarterRateInstrCost(CostKind);
667
668 if (!HasFP32Denormals) {
669 // FP mode switches.
670 Cost += 2 * getFullRateInstrCost();
671 }
672
673 return LT.first * NElts * Cost;
674 }
675 break;
676 case ISD::FNEG:
677 // Use the backend's estimation. If fneg is not free each element will cost
678 // one additional instruction.
679 return TLI->isFNegFree(SLT) ? 0 : NElts;
680 default:
681 break;
682 }
683
684 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
685 Args, CxtI);
686}
687
688// Return true if there's a potential benefit from using v2f16/v2i16
689// instructions for an intrinsic, even if it requires nontrivial legalization.
690static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
691 switch (ID) {
692 case Intrinsic::fma:
693 case Intrinsic::fmuladd:
694 case Intrinsic::copysign:
695 case Intrinsic::minimumnum:
696 case Intrinsic::maximumnum:
697 case Intrinsic::canonicalize:
698 // There's a small benefit to using vector ops in the legalized code.
699 case Intrinsic::round:
700 case Intrinsic::uadd_sat:
701 case Intrinsic::usub_sat:
702 case Intrinsic::sadd_sat:
703 case Intrinsic::ssub_sat:
704 case Intrinsic::abs:
705 return true;
706 default:
707 return false;
708 }
709}
710
711InstructionCost
712GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
713 TTI::TargetCostKind CostKind) const {
714 switch (ICA.getID()) {
715 case Intrinsic::fabs:
716 // Free source modifier in the common case.
717 return 0;
718 case Intrinsic::amdgcn_workitem_id_x:
719 case Intrinsic::amdgcn_workitem_id_y:
720 case Intrinsic::amdgcn_workitem_id_z:
721 // TODO: If hasPackedTID, or if the calling context is not an entry point
722 // there may be a bit instruction.
723 return 0;
724 case Intrinsic::amdgcn_workgroup_id_x:
725 case Intrinsic::amdgcn_workgroup_id_y:
726 case Intrinsic::amdgcn_workgroup_id_z:
727 case Intrinsic::amdgcn_lds_kernel_id:
728 case Intrinsic::amdgcn_dispatch_ptr:
729 case Intrinsic::amdgcn_dispatch_id:
730 case Intrinsic::amdgcn_implicitarg_ptr:
731 case Intrinsic::amdgcn_queue_ptr:
732 // Read from an argument register.
733 return 0;
734 default:
735 break;
736 }
737
738 if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
739 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
740
741 Type *RetTy = ICA.getReturnType();
742
743 // Legalize the type.
744 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
745
746 unsigned NElts = LT.second.isVector() ?
747 LT.second.getVectorNumElements() : 1;
748
749 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
750
751 if ((ST->hasVOP3PInsts() &&
752 (SLT == MVT::f16 || SLT == MVT::i16 ||
753 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
754 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
755 NElts = (NElts + 1) / 2;
756
757 // TODO: Get more refined intrinsic costs?
758 unsigned InstRate = getQuarterRateInstrCost(CostKind);
759
760 switch (ICA.getID()) {
761 case Intrinsic::fma:
762 case Intrinsic::fmuladd:
763 if (SLT == MVT::f64) {
764 InstRate = get64BitInstrCost(CostKind);
765 break;
766 }
767
768 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
769 InstRate = getFullRateInstrCost();
770 else {
771 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
772 : getQuarterRateInstrCost(CostKind);
773 }
774 break;
775 case Intrinsic::copysign:
776 return NElts * getFullRateInstrCost();
777 case Intrinsic::minimumnum:
778 case Intrinsic::maximumnum: {
779 // Instruction + 2 canonicalizes. For cases that need type promotion, the
780 // promotion takes the place of the canonicalize.
781 unsigned NumOps = 3;
782 if (const IntrinsicInst *II = ICA.getInst()) {
783 // Directly legal with ieee=0
784 // TODO: Not directly legal with strictfp
785 if (fpenvIEEEMode(*II) == KnownIEEEMode::Off)
786 NumOps = 1;
787 }
788
789 unsigned BaseRate =
790 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
791 InstRate = BaseRate * NumOps;
792 break;
793 }
794 case Intrinsic::canonicalize: {
795 InstRate =
796 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
797 break;
798 }
799 case Intrinsic::uadd_sat:
800 case Intrinsic::usub_sat:
801 case Intrinsic::sadd_sat:
802 case Intrinsic::ssub_sat: {
803 if (SLT == MVT::i16 || SLT == MVT::i32)
804 InstRate = getFullRateInstrCost();
805
806 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
807 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
808 NElts = 1;
809 break;
810 }
811 case Intrinsic::abs:
812 // Expansion takes 2 instructions for VALU
813 if (SLT == MVT::i16 || SLT == MVT::i32)
814 InstRate = 2 * getFullRateInstrCost();
815 break;
816 default:
817 break;
818 }
819
820 return LT.first * NElts * InstRate;
821}
822
823InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
824 TTI::TargetCostKind CostKind,
825 const Instruction *I) const {
826 assert((I == nullptr || I->getOpcode() == Opcode) &&
827 "Opcode should reflect passed instruction.");
828 const bool SCost =
829 (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
830 const int CBrCost = SCost ? 5 : 7;
831 switch (Opcode) {
832 case Instruction::Br: {
833 // Branch instruction takes about 4 slots on gfx900.
834 const auto *BI = dyn_cast_or_null<BranchInst>(I);
835 if (BI && BI->isUnconditional())
836 return SCost ? 1 : 4;
837 // Suppose conditional branch takes additional 3 exec manipulations
838 // instructions in average.
839 return CBrCost;
840 }
841 case Instruction::Switch: {
842 const auto *SI = dyn_cast_or_null<SwitchInst>(I);
843 // Each case (including default) takes 1 cmp + 1 cbr instructions in
844 // average.
845 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
846 }
847 case Instruction::Ret:
848 return SCost ? 1 : 10;
849 }
850 return BaseT::getCFInstrCost(Opcode, CostKind, I);
851}
852
853InstructionCost
854GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
855 std::optional<FastMathFlags> FMF,
856 TTI::TargetCostKind CostKind) const {
857 if (TTI::requiresOrderedReduction(FMF))
858 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
859
860 EVT OrigTy = TLI->getValueType(DL, Ty);
861
862 // Computes cost on targets that have packed math instructions(which support
863 // 16-bit types only).
864 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
865 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
866
867 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
868 return LT.first * getFullRateInstrCost();
869}
870
871InstructionCost
872GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
873 FastMathFlags FMF,
874 TTI::TargetCostKind CostKind) const {
875 EVT OrigTy = TLI->getValueType(DL, Ty);
876
877 // Computes cost on targets that have packed math instructions(which support
878 // 16-bit types only).
879 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
880 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
881
882 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
883 return LT.first * getHalfRateInstrCost(CostKind);
884}
885
886InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
887 TTI::TargetCostKind CostKind,
888 unsigned Index, const Value *Op0,
889 const Value *Op1) const {
890 switch (Opcode) {
891 case Instruction::ExtractElement:
892 case Instruction::InsertElement: {
893 unsigned EltSize
894 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
895 if (EltSize < 32) {
896 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
897 return 0;
898 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
899 Op1);
900 }
901
902 // Extracts are just reads of a subregister, so are free. Inserts are
903 // considered free because we don't want to have any cost for scalarizing
904 // operations, and we don't have to copy into a different register class.
905
906 // Dynamic indexing isn't free and is best avoided.
907 return Index == ~0u ? 2 : 0;
908 }
909 default:
910 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
911 }
912}
913
914/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
915/// this is analyzing the collective result of all output registers. Otherwise,
916/// this is only querying a specific result index if this returns multiple
917/// registers in a struct.
918bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
919 const CallInst *CI, ArrayRef<unsigned> Indices) const {
920 // TODO: Handle complex extract indices
921 if (Indices.size() > 1)
922 return true;
923
924 const DataLayout &DL = CI->getDataLayout();
925 const SIRegisterInfo *TRI = ST->getRegisterInfo();
926 TargetLowering::AsmOperandInfoVector TargetConstraints =
927 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
928
929 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
930
931 int OutputIdx = 0;
932 for (auto &TC : TargetConstraints) {
933 if (TC.Type != InlineAsm::isOutput)
934 continue;
935
936 // Skip outputs we don't care about.
937 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
938 continue;
939
940 TLI->ComputeConstraintToUse(TC, SDValue());
941
942 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
943 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
944
945 // For AGPR constraints null is returned on subtargets without AGPRs, so
946 // assume divergent for null.
947 if (!RC || !TRI->isSGPRClass(RC))
948 return true;
949 }
950
951 return false;
952}
953
954bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
955 const IntrinsicInst *ReadReg) const {
956 Metadata *MD =
957 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
958 StringRef RegName =
959 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
960
961 // Special case registers that look like VCC.
962 MVT VT = MVT::getVT(ReadReg->getType());
963 if (VT == MVT::i1)
964 return true;
965
966 // Special case scalar registers that start with 'v'.
967 if (RegName.starts_with("vcc") || RegName.empty())
968 return false;
969
970 // VGPR or AGPR is divergent. There aren't any specially named vector
971 // registers.
972 return RegName[0] == 'v' || RegName[0] == 'a';
973}
974
975/// \returns true if the result of the value could potentially be
976/// different across workitems in a wavefront.
977bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
978 if (const Argument *A = dyn_cast<Argument>(V))
979 return !AMDGPU::isArgPassedInSGPR(A);
980
981 // Loads from the private and flat address spaces are divergent, because
982 // threads can execute the load instruction with the same inputs and get
983 // different results.
984 //
985 // All other loads are not divergent, because if threads issue loads with the
986 // same arguments, they will always get the same result.
987 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
988 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
989 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
990
991 // Atomics are divergent because they are executed sequentially: when an
992 // atomic operation refers to the same address in each thread, then each
993 // thread after the first sees the value written by the previous thread as
994 // original value.
995 if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
996 return true;
997
998 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
999 Intrinsic::ID IID = Intrinsic->getIntrinsicID();
1000 switch (IID) {
1001 case Intrinsic::read_register:
1002 return isReadRegisterSourceOfDivergence(Intrinsic);
1003 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1004 unsigned SrcAS =
1005 Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
1006 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1007 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1008 DstAS == AMDGPUAS::FLAT_ADDRESS &&
1009 ST->hasGloballyAddressableScratch();
1010 }
1011 case Intrinsic::amdgcn_workitem_id_y:
1012 case Intrinsic::amdgcn_workitem_id_z: {
1013 const Function *F = Intrinsic->getFunction();
1014 bool HasUniformYZ =
1015 ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequiresUniformYZ=*/true);
1016 std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1017 *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1018 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1019 }
1020 default:
1021 return AMDGPU::isIntrinsicSourceOfDivergence(IID);
1022 }
1023 }
1024
1025 // Assume all function calls are a source of divergence.
1026 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1027 if (CI->isInlineAsm())
1028 return isInlineAsmSourceOfDivergence(CI);
1029 return true;
1030 }
1031
1032 // Assume all function calls are a source of divergence.
1033 if (isa<InvokeInst>(V))
1034 return true;
1035
1036 // If the target supports globally addressable scratch, the mapping from
1037 // scratch memory to the flat aperture changes therefore an address space cast
1038 // is no longer uniform.
1039 if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
1040 return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1041 CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1042 ST->hasGloballyAddressableScratch();
1043 }
1044
1045 return false;
1046}
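// Example (illustrative): a load from addrspace(5) (private) or addrspace(0)
// (flat) is treated as divergent by the hook above, while a load from
// addrspace(1) (global) with uniform operands is not.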
1047
1048bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
1049 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
1050 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
1051
1052 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1053 if (CI->isInlineAsm())
1054 return !isInlineAsmSourceOfDivergence(CI);
1055 return false;
1056 }
1057
1058 // In most cases TID / wavefrontsize is uniform.
1059 //
1060 // However, if a kernel has uneven dimensions we can have a value of
1061 // workitem-id-x divided by the wavefrontsize non-uniform. For example
1062 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
1063 // packed into a same wave which gives 1 and 0 after the division by 64
1064 // respectively.
1065 //
1066 // The X dimension doesn't reset within a wave if either both the Y
1067 // and Z dimensions are of length 1, or if the X dimension's required
1068 // size is a power of 2. Note, however, if the X dimension's maximum
1069 // size is a power of 2 < the wavefront size, division by the wavefront
1070 // size is guaranteed to yield 0, so this is also a no-reset case.
1071 bool XDimDoesntResetWithinWaves = false;
1072 if (auto *I = dyn_cast<Instruction>(V)) {
1073 const Function *F = I->getFunction();
1074 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
1075 }
1076 using namespace llvm::PatternMatch;
1077 uint64_t C;
1078 if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1079 m_ConstantInt(C))) ||
1080 match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1081 m_ConstantInt(C)))) {
1082 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1083 }
1084
1085 Value *Mask;
1086 if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1087 m_Value(Mask)))) {
1088 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
1089 ST->getWavefrontSizeLog2() &&
1090 XDimDoesntResetWithinWaves;
1091 }
1092
1093 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1094 if (!ExtValue)
1095 return false;
1096
1097 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1098 if (!CI)
1099 return false;
1100
1101 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1102 switch (Intrinsic->getIntrinsicID()) {
1103 default:
1104 return false;
1105 case Intrinsic::amdgcn_if:
1106 case Intrinsic::amdgcn_else: {
1107 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1108 return Indices.size() == 1 && Indices[0] == 1;
1109 }
1110 }
1111 }
1112
1113 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1114 // divergent for the overall struct return. We need to override it in the
1115 // case we're extracting an SGPR component here.
1116 if (CI->isInlineAsm())
1117 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1118
1119 return false;
1120}
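// Example (illustrative): on a wave64 subtarget (getWavefrontSizeLog2() == 6)
// "lshr i32 %workitem.id.x, 6" is reported uniform by the code above, as long
// as the X dimension does not reset within a wave.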
1121
1122bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1123 Intrinsic::ID IID) const {
1124 switch (IID) {
1125 case Intrinsic::amdgcn_is_shared:
1126 case Intrinsic::amdgcn_is_private:
1127 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1128 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1129 case Intrinsic::amdgcn_load_to_lds:
1130 case Intrinsic::amdgcn_make_buffer_rsrc:
1131 OpIndexes.push_back(0);
1132 return true;
1133 default:
1134 return false;
1135 }
1136}
1137
1138Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1139 Value *OldV,
1140 Value *NewV) const {
1141 auto IntrID = II->getIntrinsicID();
1142 switch (IntrID) {
1143 case Intrinsic::amdgcn_is_shared:
1144 case Intrinsic::amdgcn_is_private: {
1145 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1146 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1147 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1148 LLVMContext &Ctx = NewV->getType()->getContext();
1149 ConstantInt *NewVal = (TrueAS == NewAS) ?
1150 ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
1151 return NewVal;
1152 }
1153 case Intrinsic::ptrmask: {
1154 unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1155 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1156 Value *MaskOp = II->getArgOperand(1);
1157 Type *MaskTy = MaskOp->getType();
1158
1159 bool DoTruncate = false;
1160
1161 const GCNTargetMachine &TM =
1162 static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1163 if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1164 // All valid 64-bit to 32-bit casts work by chopping off the high
1165 // bits. Any masking only clearing the low bits will also apply in the new
1166 // address space.
1167 if (DL.getPointerSizeInBits(OldAS) != 64 ||
1168 DL.getPointerSizeInBits(NewAS) != 32)
1169 return nullptr;
1170
1171 // TODO: Do we need to thread more context in here?
1172 KnownBits Known = computeKnownBits(MaskOp, DL, nullptr, II);
1173 if (Known.countMinLeadingOnes() < 32)
1174 return nullptr;
1175
1176 DoTruncate = true;
1177 }
1178
1179 IRBuilder<> B(II);
1180 if (DoTruncate) {
1181 MaskTy = B.getInt32Ty();
1182 MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1183 }
1184
1185 return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1186 {NewV, MaskOp});
1187 }
1188 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1189 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1190 Type *DestTy = II->getType();
1191 Type *SrcTy = NewV->getType();
1192 unsigned NewAS = SrcTy->getPointerAddressSpace();
1193 if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS))
1194 return nullptr;
1195 Module *M = II->getModule();
1196 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1197 M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1198 II->setArgOperand(0, NewV);
1199 II->setCalledFunction(NewDecl);
1200 return II;
1201 }
1202 case Intrinsic::amdgcn_load_to_lds: {
1203 Type *SrcTy = NewV->getType();
1204 Module *M = II->getModule();
1205 Function *NewDecl =
1206 Intrinsic::getOrInsertDeclaration(M, II->getIntrinsicID(), {SrcTy});
1207 II->setArgOperand(0, NewV);
1208 II->setCalledFunction(NewDecl);
1209 return II;
1210 }
1211 case Intrinsic::amdgcn_make_buffer_rsrc: {
1212 Type *SrcTy = NewV->getType();
1213 Type *DstTy = II->getType();
1214 Module *M = II->getModule();
1215 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1216 M, II->getIntrinsicID(), {DstTy, SrcTy});
1217 II->setArgOperand(0, NewV);
1218 II->setCalledFunction(NewDecl);
1219 return II;
1220 }
1221 default:
1222 return nullptr;
1223 }
1224}
1225
1226InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1227 VectorType *DstTy, VectorType *SrcTy,
1228 ArrayRef<int> Mask,
1229 TTI::TargetCostKind CostKind,
1230 int Index, VectorType *SubTp,
1231 ArrayRef<const Value *> Args,
1232 const Instruction *CxtI) const {
1233 if (!isa<FixedVectorType>(SrcTy))
1234 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1235 SubTp);
1236
1237 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1238
1239 unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
1240 if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1241 (ScalarSize == 16 || ScalarSize == 8)) {
1242 // Larger vector widths may require additional instructions, but are
1243 // typically cheaper than scalarized versions.
1244 //
1245 // We assume that shuffling at a register granularity can be done for free.
1246 // This is not true for vectors fed into memory instructions, but it is
1247 // effectively true for all other shuffling. The emphasis of the logic here
1248 // is to assist generic transform in cleaning up / canonicalizing those
1249 // shuffles.
1250
1251 // With op_sel VOP3P instructions freely can access the low half or high
1252 // half of a register, so any swizzle of two elements is free.
1253 if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {
1254 unsigned NumSrcElts = SrcVecTy->getNumElements();
1255 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
1256 (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
1257 Kind == TTI::SK_PermuteSingleSrc))
1258 return 0;
1259 }
1260
1261 unsigned EltsPerReg = 32 / ScalarSize;
1262 switch (Kind) {
1263 case TTI::SK_Broadcast:
1264 // A single v_perm_b32 can be re-used for all destination registers.
1265 return 1;
1266 case TTI::SK_Reverse:
1267 // One instruction per register.
1268 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1269 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1270 break;
1271 case TTI::SK_ExtractSubvector:
1272 if (Index % EltsPerReg == 0)
1273 return 0; // Shuffling at register granularity
1274 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1275 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1276 break;
1277 case TTI::SK_InsertSubvector: {
1278 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1279 if (!DstVecTy)
1280 break;
1281 unsigned NumDstElts = DstVecTy->getNumElements();
1282 unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
1283 unsigned EndIndex = Index + NumInsertElts;
1284 unsigned BeginSubIdx = Index % EltsPerReg;
1285 unsigned EndSubIdx = EndIndex % EltsPerReg;
1286 unsigned Cost = 0;
1287
1288 if (BeginSubIdx != 0) {
1289 // Need to shift the inserted vector into place. The cost is the number
1290 // of destination registers overlapped by the inserted vector.
1291 Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
1292 }
1293
1294 // If the last register overlap is partial, there may be three source
1295 // registers feeding into it; that takes an extra instruction.
1296 if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
1297 Cost += 1;
1298
1299 return Cost;
1300 }
1301 case TTI::SK_Splice: {
1302 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1303 if (!DstVecTy)
1304 break;
1305 unsigned NumElts = DstVecTy->getNumElements();
1306 assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
1307 // Determine the sub-region of the result vector that requires
1308 // sub-register shuffles / mixing.
1309 unsigned EltsFromLHS = NumElts - Index;
1310 bool LHSIsAligned = (Index % EltsPerReg) == 0;
1311 bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
1312 if (LHSIsAligned && RHSIsAligned)
1313 return 0;
1314 if (LHSIsAligned && !RHSIsAligned)
1315 return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
1316 if (!LHSIsAligned && RHSIsAligned)
1317 return divideCeil(EltsFromLHS, EltsPerReg);
1318 return divideCeil(NumElts, EltsPerReg);
1319 }
1320 default:
1321 break;
1322 }
1323
1324 if (!Mask.empty()) {
1325 unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
1326
1327 // Generically estimate the cost by assuming that each destination
1328 // register is derived from sources via v_perm_b32 instructions if it
1329 // can't be copied as-is.
1330 //
1331 // For each destination register, derive the cost of obtaining it based
1332 // on the number of source registers that feed into it.
1333 unsigned Cost = 0;
1334 for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
1335 SmallVector<int, 4> Regs;
1336 bool Aligned = true;
1337 for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
1338 int SrcIdx = Mask[DstIdx + I];
1339 if (SrcIdx == -1)
1340 continue;
1341 int Reg;
1342 if (SrcIdx < (int)NumSrcElts) {
1343 Reg = SrcIdx / EltsPerReg;
1344 if (SrcIdx % EltsPerReg != I)
1345 Aligned = false;
1346 } else {
1347 Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
1348 if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
1349 Aligned = false;
1350 }
1351 if (!llvm::is_contained(Regs, Reg))
1352 Regs.push_back(Reg);
1353 }
1354 if (Regs.size() >= 2)
1355 Cost += Regs.size() - 1;
1356 else if (!Aligned)
1357 Cost += 1;
1358 }
1359 return Cost;
1360 }
1361 }
1362
1363 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1364 SubTp);
1365}
1366
1367/// Whether it is profitable to sink the operands of an
1368/// Instruction I to the basic block of I.
1369/// This helps using several modifiers (like abs and neg) more often.
1370bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
1371 SmallVectorImpl<Use *> &Ops) const {
1372 using namespace PatternMatch;
1373
1374 for (auto &Op : I->operands()) {
1375 // Ensure we are not already sinking this operand.
1376 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
1377 continue;
1378
1379 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
1380 Ops.push_back(&Op);
1381 }
1382
1383 return !Ops.empty();
1384}
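// Example (illustrative, assuming the usual CodeGenPrepare operand sinking):
// if "%n = fneg float %x" is defined in another block and only feeds an fmul
// here, returning its use lets the fneg be sunk next to the fmul, where it
// can fold into a source modifier instead of costing a separate instruction.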
1385
1386bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1387 const Function *Callee) const {
1388 const TargetMachine &TM = getTLI()->getTargetMachine();
1389 const GCNSubtarget *CallerST
1390 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1391 const GCNSubtarget *CalleeST
1392 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1393
1394 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1395 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1396
1397 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1398 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1399 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1400 return false;
1401
1402 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1403 // no way to support merge for backend defined attributes.
1404 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1405 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1406 if (!CallerMode.isInlineCompatible(CalleeMode))
1407 return false;
1408
1409 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1410 Callee->hasFnAttribute(Attribute::InlineHint))
1411 return true;
1412
1413 // Hack to make compile times reasonable.
1414 if (InlineMaxBB) {
1415 // Single BB does not increase total BB amount.
1416 if (Callee->size() == 1)
1417 return true;
1418 size_t BBSize = Caller->size() + Callee->size() - 1;
1419 return BBSize <= InlineMaxBB;
1420 }
1421
1422 return true;
1423}
1424
1425static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
1426 const SITargetLowering *TLI,
1427 const GCNTTIImpl *TTIImpl) {
1428 const int NrOfSGPRUntilSpill = 26;
1429 const int NrOfVGPRUntilSpill = 32;
1430
1431 const DataLayout &DL = TTIImpl->getDataLayout();
1432
1433 unsigned adjustThreshold = 0;
1434 int SGPRsInUse = 0;
1435 int VGPRsInUse = 0;
1436 for (const Use &A : CB->args()) {
1437 SmallVector<EVT, 4> ValueVTs;
1438 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1439 for (auto ArgVT : ValueVTs) {
1440 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1441 CB->getContext(), CB->getCallingConv(), ArgVT);
1442 if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))
1443 SGPRsInUse += CCRegNum;
1444 else
1445 VGPRsInUse += CCRegNum;
1446 }
1447 }
1448
1449 // The cost of passing function arguments through the stack:
1450 // 1 instruction to put a function argument on the stack in the caller.
1451 // 1 instruction to take a function argument from the stack in callee.
1452 // 1 instruction to explicitly take care of data dependencies in the callee
1453 // function.
1454 InstructionCost ArgStackCost(1);
1455 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1456 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1457 AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
1458 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1459 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1460 AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
1461
1462 // The penalty cost is computed relative to the cost of instructions and does
1463 // not model any storage costs.
1464 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1465 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1466 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1467 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1468 return adjustThreshold;
1469}
1470
1471static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1472 const DataLayout &DL) {
1473 // If we have a pointer to a private array passed into a function
1474 // it will not be optimized out, leaving scratch usage.
1475 // This function calculates the total size in bytes of the memory that would
1476 // end in scratch if the call was not inlined.
1477 unsigned AllocaSize = 0;
1478 SmallPtrSet<const AllocaInst *, 8> AIVisited;
1479 for (Value *PtrArg : CB->args()) {
1480 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1481 if (!Ty)
1482 continue;
1483
1484 unsigned AddrSpace = Ty->getAddressSpace();
1485 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1486 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1487 continue;
1488
1489 const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
1490 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1491 continue;
1492
1493 AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1494 }
1495 return AllocaSize;
1496}
1497
1502
1503unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1504 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1505
1506 // Private object passed as arguments may end up in scratch usage if the call
1507 // is not inlined. Increase the inline threshold to promote inlining.
1508 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1509 if (AllocaSize > 0)
1510 Threshold += ArgAllocaCost;
1511 return Threshold;
1512}
1513
1514unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
1515 const AllocaInst *AI) const {
1516
1517 // Below the cutoff, assume that the private memory objects would be
1518 // optimized
1519 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1520 if (AllocaSize <= ArgAllocaCutoff)
1521 return 0;
1522
1523 // Above the cutoff, we give a cost to each private memory object
1524 // depending its size. If the array can be optimized by SROA this cost is not
1525 // added to the total-cost in the inliner cost analysis.
1526 //
1527 // We choose the total cost of the alloca such that their sum cancels the
1528 // bonus given in the threshold (ArgAllocaCost).
1529 //
1530 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1531 //
1532 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1533 // the single-bb bonus and the vector-bonus.
1534 //
1535 // We compensate the first two multipliers, by repeating logic from the
1536 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1537 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1538 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1539
1540 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1541 return BB.getTerminator()->getNumSuccessors() > 1;
1542 });
1543 if (SingleBB) {
1544 Threshold += Threshold / 2;
1545 }
1546
1547 auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
1548
1549 // Attribute the bonus proportionally to the alloca size
1550 unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
1551
1552 return AllocaThresholdBonus;
1553}
1554
1555void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1556 TTI::UnrollingPreferences &UP,
1557 OptimizationRemarkEmitter *ORE) const {
1558 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1559}
1560
1561void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1562 TTI::PeelingPreferences &PP) const {
1563 CommonTTI.getPeelingPreferences(L, SE, PP);
1564}
1565
1566int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1567 return ST->hasFullRate64Ops()
1568 ? getFullRateInstrCost()
1569 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1570 : getQuarterRateInstrCost(CostKind);
1571}
1572
1573std::pair<InstructionCost, MVT>
1574GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1575 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1576 auto Size = DL.getTypeSizeInBits(Ty);
1577 // Maximum load or store can handle 8 dwords for scalar and 4 for
1578 // vector ALU. Let's assume anything above 8 dwords is expensive
1579 // even if legal.
1580 if (Size <= 256)
1581 return Cost;
1582
1583 Cost.first += (Size + 255) / 256;
1584 return Cost;
1585}
1586
1587unsigned GCNTTIImpl::getPrefetchDistance() const {
1588 return ST->hasPrefetch() ? 128 : 0;
1589}
1590
1591bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
1592 return AMDGPUAS::isFlatGlobalAddrSpace(AS);
1593}
1594
1595void GCNTTIImpl::collectKernelLaunchBounds(
1596 const Function &F,
1597 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1598 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1599 LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1600 LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1601 LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1602 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1603 ST->getFlatWorkGroupSizes(F);
1604 LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1605 LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1606 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1607 LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1608 LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1609}
1610
1611GCNTTIImpl::KnownIEEEMode
1612GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
1613 if (!ST->hasIEEEMode()) // Only mode on gfx12
1614 return KnownIEEEMode::On;
1615
1616 const Function *F = I.getFunction();
1617 if (!F)
1618 return KnownIEEEMode::Unknown;
1619
1620 Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
1621 if (IEEEAttr.isValid())
1622 return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;
1623
1624 return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
1625 : KnownIEEEMode::On;
1626}
1627
1628InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1629 Align Alignment,
1630 unsigned AddressSpace,
1631 TTI::TargetCostKind CostKind,
1632 TTI::OperandValueInfo OpInfo,
1633 const Instruction *I) const {
1634 if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1635 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1636 VecTy->getElementType()->isIntegerTy(8)) {
1637 return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
1638 getLoadStoreVecRegBitWidth(AddressSpace));
1639 }
1640 }
1641 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1642 OpInfo, I);
1643}
1644
1645unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
1646 if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1647 if (VecTy->getElementType()->isIntegerTy(8)) {
1648 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1649 return divideCeil(ElementCount - 1, 4);
1650 }
1651 }
1652 return BaseT::getNumberOfParts(Tp);
1653}