//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(200), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local",
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
    cl::init(true), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(32), cl::Hidden);

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for
// that heuristic.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));

// This default unroll factor is based on microbenchmarks on gfx1030.
static cl::opt<unsigned> MemcpyLoopUnroll(
    "amdgpu-memcpy-loop-unroll",
    cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
             "operations when lowering memcpy as a loop"),
    cl::init(16), cl::Hidden);

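// Returns true if the branch condition \p Cond (followed through its operands
// up to a small recursion depth) depends on a PHI node defined in \p L itself
// rather than in one of its sub-loops. Such conditions are attractive
// unrolling candidates because unrolling may fold the if-region and the PHI.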
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
                  return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}

AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}

void AMDGPUTTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold =
      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // A conditional branch on a loop back edge needs 3 additional exec
  // manipulations on average.
  UP.BEInsns += 3;

  // We want to run unroll even for the loops which have been vectorized.
  UP.UnrollVectorizedLoop = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;

  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
  // provided threshold value as the default for Threshold.
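  // For illustration (the threshold value 100 and the metadata node names
  // below are only an example), the hint is attached as loop metadata:
  //   br i1 %cond, label %header, label %exit, !llvm.loop !0
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"amdgpu.loop.unroll.threshold", i32 100}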
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          LoopUnrollThreshold->getOperand(1));
      if (MetaThresholdValue) {
        // We will also use the supplied value for PartialThreshold for now.
        // We may introduce additional metadata if it becomes necessary in the
        // future.
        UP.Threshold = MetaThresholdValue->getSExtValue();
        UP.PartialThreshold = UP.Threshold;
        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
      }
    }
  }

  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
               return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate the
      // if-region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each of such "if" statements.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        auto AllocaSize = Alloca->getAllocationSize(DL);
        if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not to
        // a variable, most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2)
          continue;

        const Value *V = getUnderlyingObject(GEP->getPointerOperand());
        if (!isa<GlobalVariable>(V) && !isa<Argument>(V))
          continue;

        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
                 return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from an inner loop then increase the max
    // trip count to analyze for better cost estimation in unroll.
    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}

void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) const {
  BaseT::getPeelingPreferences(L, SE, PP);
}

uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}

const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
    AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be ecc is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};

GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
  SIModeRegisterDefaults Mode(F, *ST);
  HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
  HasFP64FP16Denormals =
      Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
}

bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
  return !F || !ST->isSingleLaneExecution(*F);
}

unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
  // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
  // registers. See getRegisterClassForType for the implementation.
  // In this case vector registers are not vector in terms of
  // VGPRs, but those which can hold multiple values.

  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return 4;
}

TypeSize
GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }
  llvm_unreachable("Unsupported register kind");
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

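// Example for the rule below: loads and stores are capped at 128 bits worth of
// elements (ElemWidth == 16 gives a maximum VF of 8), while other operations
// only reflect the available packed 2- or 4-element instructions.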
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
    return 32 * 4 / ElemWidth;
  // For a given width return the max number of elements that can be combined
  // into a wider bit value:
  return (ElemWidth == 8 && ST->has16BitInsts())         ? 4
         : (ElemWidth == 16 && ST->has16BitInsts())      ? 2
         : (ElemWidth == 32 && ST->hasPackedFP32Ops())   ? 2
                                                         : 1;
}

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
      AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
      AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  // Common to flat, global, local and region. Assume for unknown addrspace.
  return 128;
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            Align Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to
  // decompose them later if they may access private memory. We don't have
  // enough context here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

uint64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}

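// With the defaults above (MemcpyLoopUnroll == 16) and a constant length, the
// function below returns <64 x i32>, i.e. 16 unrolled iterations of a 4 x i32
// (16-byte) access per loop iteration; otherwise it returns plain <4 x i32>.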
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
    unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
    std::optional<uint32_t> AtomicElementSize) const {

  if (AtomicElementSize)
    return Type::getIntNTy(Context, *AtomicElementSize * 8);

  // 16-byte accesses achieve the highest copy throughput.
  // If the operation has a fixed known length that is large enough, it is
  // worthwhile to return an even wider type and let legalization lower it into
  // multiple accesses, effectively unrolling the memcpy loop.
  // We also rely on legalization to decompose into smaller accesses for
  // subtargets and address spaces where it is necessary.
  //
  // Don't unroll if Length is not a constant, since unrolling leads to worse
  // performance for length values that are smaller or slightly larger than the
  // total size of the type returned here. Mitigating that would require a more
  // complex lowering for variable-length memcpy and memmove.
  unsigned I32EltsInVector = 4;
  if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Length))
    return FixedVectorType::get(Type::getInt32Ty(Context),
                                MemcpyLoopUnroll * I32EltsInVector);

  return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
}

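// Worked example for the residual lowering below: RemainingBytes == 23 emits
// one <4 x i32> (16 bytes), then one i32, one i16 and one i8, in that order.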
void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
    Align SrcAlign, Align DestAlign,
    std::optional<uint32_t> AtomicCpySize) const {

  if (AtomicCpySize)
    BaseT::getMemcpyLoopResidualLoweringType(
        OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
        DestAlign, AtomicCpySize);

  Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
  while (RemainingBytes >= 16) {
    OpsOut.push_back(I32x4Ty);
    RemainingBytes -= 16;
  }

  Type *I64Ty = Type::getInt64Ty(Context);
  while (RemainingBytes >= 8) {
    OpsOut.push_back(I64Ty);
    RemainingBytes -= 8;
  }

  Type *I32Ty = Type::getInt32Ty(Context);
  while (RemainingBytes >= 4) {
    OpsOut.push_back(I32Ty);
    RemainingBytes -= 4;
  }

  Type *I16Ty = Type::getInt16Ty(Context);
  while (RemainingBytes >= 2) {
    OpsOut.push_back(I16Ty);
    RemainingBytes -= 2;
  }

  Type *I8Ty = Type::getInt8Ty(Context);
  while (RemainingBytes) {
    OpsOut.push_back(I8Ty);
    --RemainingBytes;
  }
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF.isScalar())
    return 1;

  return 8;
}

bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal >
        static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isZero();
    return true;
  }
  default:
    return false;
  }
}

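// For example, with the table below a 64-bit integer multiply on a legal type
// is modeled as 4 quarter-rate plus 4 full-rate instructions per element,
// while 64-bit add/and/or/xor are modeled as 2 full-rate VALU instructions.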
InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
    // fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Estimate all types may be fused with contract/unsafe flags
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost =
          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
      // Fast unsafe fdiv lowering:
      // f32 rcp
      // f32 fmul
      int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts without f16 insts support
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getQuarterRateInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free, each element will
    // cost one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}

// Return true if there's a potential benefit from using v2f16/v2i16
// instructions for an intrinsic, even if it requires nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::fma:
  case Intrinsic::fmuladd:
  case Intrinsic::copysign:
  case Intrinsic::minimumnum:
  case Intrinsic::maximumnum:
  case Intrinsic::canonicalize:
  // There's a small benefit to using vector ops in the legalized code.
  case Intrinsic::round:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::abs:
    return true;
  default:
    return false;
  }
}

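// The rates below are per legalized element; packed 16-bit types (and, where
// supported, packed bf16/f32) halve the element count since two lanes share a
// single instruction.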
InstructionCost
GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) const {
  switch (ICA.getID()) {
  case Intrinsic::fabs:
    // Free source modifier in the common case.
    return 0;
  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::amdgcn_workitem_id_z:
    // TODO: If hasPackedTID, or if the calling context is not an entry point
    // there may be a bit instruction.
    return 0;
  case Intrinsic::amdgcn_workgroup_id_x:
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::amdgcn_lds_kernel_id:
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_dispatch_id:
  case Intrinsic::amdgcn_implicitarg_ptr:
  case Intrinsic::amdgcn_queue_ptr:
    // Read from an argument register.
    return 0;
  default:
    break;
  }

  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);

  Type *RetTy = ICA.getReturnType();

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  if ((ST->hasVOP3PInsts() &&
       (SLT == MVT::f16 || SLT == MVT::i16 ||
        (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
      (ST->hasPackedFP32Ops() && SLT == MVT::f32))
    NElts = (NElts + 1) / 2;

  // TODO: Get more refined intrinsic costs?
  unsigned InstRate = getQuarterRateInstrCost(CostKind);

  switch (ICA.getID()) {
  case Intrinsic::fma:
  case Intrinsic::fmuladd:
    if (SLT == MVT::f64) {
      InstRate = get64BitInstrCost(CostKind);
      break;
    }

    if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
      InstRate = getFullRateInstrCost();
    else {
      InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
                                     : getQuarterRateInstrCost(CostKind);
    }
    break;
  case Intrinsic::copysign:
    return NElts * getFullRateInstrCost();
  case Intrinsic::minimumnum:
  case Intrinsic::maximumnum: {
    // Instruction + 2 canonicalizes. For cases that need type promotion, the
    // promotion takes the place of the canonicalize.
    unsigned NumOps = 3;
    if (const IntrinsicInst *II = ICA.getInst()) {
      // Directly legal with ieee=0
      // TODO: Not directly legal with strictfp
      if (fpenvIEEEMode(*II) == KnownIEEEMode::Off)
        NumOps = 1;
    }

    unsigned BaseRate =
        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
    InstRate = BaseRate * NumOps;
    break;
  }
  case Intrinsic::canonicalize: {
    InstRate =
        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
    break;
  }
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat: {
    if (SLT == MVT::i16 || SLT == MVT::i32)
      InstRate = getFullRateInstrCost();

    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
    if (any_of(ValidSatTys, equal_to(LT.second)))
      NElts = 1;
    break;
  }
  case Intrinsic::abs:
    // Expansion takes 2 instructions for VALU
    if (SLT == MVT::i16 || SLT == MVT::i32)
      InstRate = 2 * getFullRateInstrCost();
    break;
  default:
    break;
  }

  return LT.first * NElts * InstRate;
}

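// Note on the switch cost below: a switch with N cases is modeled as N + 1
// compare-and-branch pairs (including the default), each costing CBrCost + 1.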
InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) const {
  assert((I == nullptr || I->getOpcode() == Opcode) &&
         "Opcode should reflect passed instruction.");
  const bool SCost =
      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
  const int CBrCost = SCost ? 5 : 7;
  switch (Opcode) {
  case Instruction::Br: {
    // Branch instruction takes about 4 slots on gfx900.
    const auto *BI = dyn_cast_or_null<BranchInst>(I);
    if (BI && BI->isUnconditional())
      return SCost ? 1 : 4;
    // Suppose a conditional branch takes 3 additional exec manipulation
    // instructions on average.
    return CBrCost;
  }
  case Instruction::Switch: {
    const auto *SI = dyn_cast_or_null<SwitchInst>(I);
    // Each case (including default) takes 1 cmp + 1 cbr instructions on
    // average.
    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
  }
  case Instruction::Ret:
    return SCost ? 1 : 10;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

InstructionCost
GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                       std::optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) const {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getFullRateInstrCost();
}

InstructionCost
GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                   FastMathFlags FMF,
                                   TTI::TargetCostKind CostKind) const {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getHalfRateInstrCost(CostKind);
}

InstructionCost GCNTTIImpl::getVectorInstrCost(
    unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
    const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
                                       VIC);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
                                     VIC);
  }
}

/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
/// registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
    const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  TargetLowering::AsmOperandInfoVector TargetConstraints =
      TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);

  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    TLI->ComputeConstraintToUse(TC, SDValue());

    const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
        TRI, TC.ConstraintCode, TC.ConstraintVT).second;

    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  return false;
}

bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
    const IntrinsicInst *ReadReg) const {
  Metadata *MD =
      cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
  StringRef RegName =
      cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();

  // Special case registers that look like VCC.
  MVT VT = MVT::getVT(ReadReg->getType());
  if (VT == MVT::i1)
    return true;

  // Special case scalar registers that start with 'v'.
  if (RegName.starts_with("vcc") || RegName.empty())
    return false;

  // VGPR or AGPR is divergent. There aren't any specially named vector
  // registers.
  return RegName[0] == 'v' || RegName[0] == 'a';
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !AMDGPU::isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    Intrinsic::ID IID = Intrinsic->getIntrinsicID();
    switch (IID) {
    case Intrinsic::read_register:
      return isReadRegisterSourceOfDivergence(Intrinsic);
    case Intrinsic::amdgcn_addrspacecast_nonnull: {
      unsigned SrcAS =
          Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
      unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
      return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
             DstAS == AMDGPUAS::FLAT_ADDRESS &&
             ST->hasGloballyAddressableScratch();
    }
    case Intrinsic::amdgcn_workitem_id_y:
    case Intrinsic::amdgcn_workitem_id_z: {
      const Function *F = Intrinsic->getFunction();
      bool HasUniformYZ =
          ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequiresUniformYZ=*/true);
      std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
          *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
      return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
    }
    default:
      return AMDGPU::isIntrinsicSourceOfDivergence(IID);
    }
  }

  // Assume all function calls are a source of divergence.
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return isInlineAsmSourceOfDivergence(CI);
    return true;
  }

  // Assume all function calls are a source of divergence.
  if (isa<InvokeInst>(V))
    return true;

  // If the target supports globally addressable scratch, the mapping from
  // scratch memory to the flat aperture changes, therefore an address space
  // cast is no longer uniform.
  if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
    return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
           CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
           ST->hasGloballyAddressableScratch();
  }

  return false;
}

bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  // In most cases TID / wavefrontsize is uniform.
  //
  // However, if a kernel has uneven dimensions we can have a value of
  // workitem-id-x divided by the wavefrontsize non-uniform. For example
  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
  // packed into a same wave which gives 1 and 0 after the division by 64
  // respectively.
  //
  // The X dimension doesn't reset within a wave if either both the Y
  // and Z dimensions are of length 1, or if the X dimension's required
  // size is a power of 2. Note, however, if the X dimension's maximum
  // size is a power of 2 < the wavefront size, division by the wavefront
  // size is guaranteed to yield 0, so this is also a no-reset case.
  bool XDimDoesntResetWithinWaves = false;
  if (auto *I = dyn_cast<Instruction>(V)) {
    const Function *F = I->getFunction();
    XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
  }
  using namespace llvm::PatternMatch;
  uint64_t C;
  if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C))) ||
      match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C)))) {
    return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
  }

  Value *Mask;
  if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       m_Value(Mask)))) {
    return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
               ST->getWavefrontSizeLog2() &&
           XDimDoesntResetWithinWaves;
  }

  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());

  return false;
}

bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_make_buffer_rsrc:
    OpIndexes.push_back(0);
    return true;
  default:
    return false;
  }
}

Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
    return NewVal;
  }
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num: {
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    unsigned NewAS = SrcTy->getPointerAddressSpace();
    if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS))
      return nullptr;
    Module *M = II->getModule();
    Function *NewDecl = Intrinsic::getOrInsertDeclaration(
        M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_load_to_lds: {
    Type *SrcTy = NewV->getType();
    Module *M = II->getModule();
    Function *NewDecl =
        Intrinsic::getOrInsertDeclaration(M, II->getIntrinsicID(), {SrcTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_make_buffer_rsrc: {
    Type *SrcTy = NewV->getType();
    Type *DstTy = II->getType();
    Module *M = II->getModule();
    Function *NewDecl = Intrinsic::getOrInsertDeclaration(
        M, II->getIntrinsicID(), {DstTy, SrcTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  default:
    return nullptr;
  }
}

InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *DstTy, VectorType *SrcTy,
                                           ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args,
                                           const Instruction *CxtI) const {
  if (!isa<FixedVectorType>(SrcTy))
    return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
                                 SubTp);

  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);

  unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
  if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      (ScalarSize == 16 || ScalarSize == 8)) {
    // Larger vector widths may require additional instructions, but are
    // typically cheaper than scalarized versions.
    //
    // We assume that shuffling at a register granularity can be done for free.
    // This is not true for vectors fed into memory instructions, but it is
    // effectively true for all other shuffling. The emphasis of the logic here
    // is to assist generic transforms in cleaning up / canonicalizing those
    // shuffles.

    // With op_sel VOP3P instructions can freely access the low half or high
    // half of a register, so any swizzle of two elements is free.
    if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {
      unsigned NumSrcElts = SrcVecTy->getNumElements();
      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
          (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
           Kind == TTI::SK_PermuteSingleSrc))
        return 0;
    }

    unsigned EltsPerReg = 32 / ScalarSize;
    switch (Kind) {
    case TTI::SK_Broadcast:
      // A single v_perm_b32 can be re-used for all destination registers.
      return 1;
    case TTI::SK_Reverse:
      // One instruction per register.
      if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
        return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
      break;
    case TTI::SK_ExtractSubvector:
      if (Index % EltsPerReg == 0)
        return 0; // Shuffling at register granularity
      if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
        return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
      break;
    case TTI::SK_InsertSubvector: {
      auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
      if (!DstVecTy)
        break;
      unsigned NumDstElts = DstVecTy->getNumElements();
      unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
      unsigned EndIndex = Index + NumInsertElts;
      unsigned BeginSubIdx = Index % EltsPerReg;
      unsigned EndSubIdx = EndIndex % EltsPerReg;
      unsigned Cost = 0;

      if (BeginSubIdx != 0) {
        // Need to shift the inserted vector into place. The cost is the number
        // of destination registers overlapped by the inserted vector.
        Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
      }

      // If the last register overlap is partial, there may be three source
      // registers feeding into it; that takes an extra instruction.
      if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
        Cost += 1;

      return Cost;
    }
    case TTI::SK_Splice: {
      auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
      if (!DstVecTy)
        break;
      unsigned NumElts = DstVecTy->getNumElements();
      assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
      // Determine the sub-region of the result vector that requires
      // sub-register shuffles / mixing.
      unsigned EltsFromLHS = NumElts - Index;
      bool LHSIsAligned = (Index % EltsPerReg) == 0;
      bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
      if (LHSIsAligned && RHSIsAligned)
        return 0;
      if (LHSIsAligned && !RHSIsAligned)
        return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
      if (!LHSIsAligned && RHSIsAligned)
        return divideCeil(EltsFromLHS, EltsPerReg);
      return divideCeil(NumElts, EltsPerReg);
    }
    default:
      break;
    }

    if (!Mask.empty()) {
      unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();

      // Generically estimate the cost by assuming that each destination
      // register is derived from sources via v_perm_b32 instructions if it
      // can't be copied as-is.
      //
      // For each destination register, derive the cost of obtaining it based
      // on the number of source registers that feed into it.
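      //
      // For example, with 16-bit elements (two per 32-bit register), the mask
      // <2, 3, 0, 1> on a v4i16 source merely swaps whole registers and costs
      // 0, while <1, 2, 3, 0> pulls each destination register from two source
      // registers and costs 1 per destination register, i.e. 2 in total.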
      unsigned Cost = 0;
      for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
        // Source registers feeding this destination register.
        SmallVector<int, 4> Regs;
        bool Aligned = true;
        for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
          int SrcIdx = Mask[DstIdx + I];
          if (SrcIdx == -1)
            continue;
          int Reg;
          if (SrcIdx < (int)NumSrcElts) {
            Reg = SrcIdx / EltsPerReg;
            if (SrcIdx % EltsPerReg != I)
              Aligned = false;
          } else {
            Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
            if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
              Aligned = false;
          }
          if (!llvm::is_contained(Regs, Reg))
            Regs.push_back(Reg);
        }
        if (Regs.size() >= 2)
          Cost += Regs.size() - 1;
        else if (!Aligned)
          Cost += 1;
      }
      return Cost;
    }
  }

  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
                               SubTp);
}

/// Whether it is profitable to sink the operands of an
/// Instruction I to the basic block of I.
/// This helps using several modifiers (like abs and neg) more often.
bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
                                            SmallVectorImpl<Use *> &Ops) const {
  using namespace PatternMatch;

  for (auto &Op : I->operands()) {
    // Ensure we are not already sinking this operand.
    if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
      continue;

    if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
      Ops.push_back(&Op);
  }

  return !Ops.empty();
}

bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
  SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
  if (!CallerMode.isInlineCompatible(CalleeMode))
    return false;

  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
      Callee->hasFnAttribute(Attribute::InlineHint))
    return true;

  // Hack to make compile times reasonable.
  if (InlineMaxBB) {
    // Single BB does not increase total BB amount.
    if (Callee->size() == 1)
      return true;
    size_t BBSize = Caller->size() + Callee->size() - 1;
    return BBSize <= InlineMaxBB;
  }

  return true;
}

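// Estimate an inline-threshold bump for calls whose arguments would no longer
// fit in argument registers: e.g. if a callee receives 40 VGPRs worth of
// arguments, the 8 registers beyond the assumed 32-VGPR budget are each
// charged the stack-passing cost computed below.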
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
                                                   const SITargetLowering *TLI,
                                                   const GCNTTIImpl *TTIImpl) {
  const int NrOfSGPRUntilSpill = 26;
  const int NrOfVGPRUntilSpill = 32;

  const DataLayout &DL = TTIImpl->getDataLayout();

  unsigned adjustThreshold = 0;
  int SGPRsInUse = 0;
  int VGPRsInUse = 0;
  for (const Use &A : CB->args()) {
    SmallVector<EVT, 4> ValueVTs;
    ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
    for (auto ArgVT : ValueVTs) {
      unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
          CB->getContext(), CB->getCallingConv(), ArgVT);
      if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))
        SGPRsInUse += CCRegNum;
      else
        VGPRsInUse += CCRegNum;
    }
  }

  // The cost of passing function arguments through the stack:
  //  1 instruction to put a function argument on the stack in the caller.
  //  1 instruction to take a function argument from the stack in the callee.
  //  1 instruction explicitly taking care of data dependencies in the callee
  //  function.
  InstructionCost ArgStackCost(1);
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);

  // The penalty cost is computed relative to the cost of instructions and does
  // not model any storage costs.
  adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
                     ArgStackCost.getValue() * InlineConstants::getInstrCost();
  adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
                     ArgStackCost.getValue() * InlineConstants::getInstrCost();
  return adjustThreshold;
}

static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
                                           const DataLayout &DL) {
  // If we have a pointer to a private array passed into a function
  // it will not be optimized out, leaving scratch usage.
  // This function calculates the total size in bytes of the memory that would
  // end in scratch if the call was not inlined.
  unsigned AllocaSize = 0;
  SmallPtrSet<const AllocaInst *, 8> AIVisited;
  for (Value *PtrArg : CB->args()) {
    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
    if (!Ty)
      continue;

    unsigned AddrSpace = Ty->getAddressSpace();
    if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
        AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
      continue;

    const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
    if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
      continue;

    if (auto Size = AI->getAllocationSize(DL))
      AllocaSize += Size->getFixedValue();
  }
  return AllocaSize;
}

unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);

  // Private objects passed as arguments may end up in scratch usage if the
  // call is not inlined. Increase the inline threshold to promote inlining.
  unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
  if (AllocaSize > 0)
    Threshold += ArgAllocaCost;
  return Threshold;
}

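// Example for the cost attribution below: if a call passes pointers to two
// private arrays of 128 and 384 bytes (512 bytes total, above the 256-byte
// cutoff), each alloca receives a share of the threshold bonus proportional to
// its size, i.e. 1/4 and 3/4 respectively.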
unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
                                         const AllocaInst *AI) const {

  // Below the cutoff, assume that the private memory objects would be
  // optimized.
  auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
  if (AllocaSize <= ArgAllocaCutoff)
    return 0;

  // Above the cutoff, we give a cost to each private memory object depending
  // on its size. If the array can be optimized by SROA this cost is not added
  // to the total-cost in the inliner cost analysis.
  //
  // We choose the total cost of the alloca such that their sum cancels the
  // bonus given in the threshold (ArgAllocaCost).
  //
  //   Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
  //
  // Awkwardly, the ArgAllocaCost bonus is multiplied by the threshold
  // multiplier, the single-bb bonus and the vector-bonus.
  //
  // We compensate the first two multipliers by repeating logic from the
  // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
  static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
  unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();

  bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
    return BB.getTerminator()->getNumSuccessors() > 1;
  });
  if (SingleBB) {
    Threshold += Threshold / 2;
  }

  auto ArgAllocaSize = AI->getAllocationSize(DL);
  if (!ArgAllocaSize)
    return 0;

  // Attribute the bonus proportionally to the alloca size.
  unsigned AllocaThresholdBonus =
      (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;

  return AllocaThresholdBonus;
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) const {
  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}

void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) const {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}

int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
  return ST->hasFullRate64Ops()
             ? getFullRateInstrCost()
             : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
                                      : getQuarterRateInstrCost(CostKind);
}

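// Example for the adjustment below: a 1024-bit vector exceeds the 8-dword
// (256-bit) limit, so its legalization cost is increased by
// (1024 + 255) / 256 == 4.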
std::pair<InstructionCost, MVT>
GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
  std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
  auto Size = DL.getTypeSizeInBits(Ty);
  // Maximum load or store can handle 8 dwords for scalar and 4 for
  // vector ALU. Let's assume anything above 8 dwords is expensive
  // even if legal.
  if (Size <= 256)
    return Cost;

  Cost.first += (Size + 255) / 256;
  return Cost;
}

unsigned GCNTTIImpl::getPrefetchDistance() const {
  return ST->hasPrefetch() ? 128 : 0;
}

bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
  return AMDGPU::isFlatGlobalAddrSpace(AS);
}

void GCNTTIImpl::collectKernelLaunchBounds(
    const Function &F,
    SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
  SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
  LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
  LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
  LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
  std::pair<unsigned, unsigned> FlatWorkGroupSize =
      ST->getFlatWorkGroupSizes(F);
  LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
  LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
  std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
  LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
  LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
}

GCNTTIImpl::KnownIEEEMode
GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
  if (!ST->hasIEEEMode()) // Only mode on gfx12
    return KnownIEEEMode::On;

  const Function *F = I.getFunction();
  if (!F)
    return KnownIEEEMode::Unknown;

  Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
  if (IEEEAttr.isValid())
    return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;

  return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
                                               : KnownIEEEMode::On;
}

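// Example for the i8-vector special case below: a <32 x i8> (256-bit) load
// from LDS, where load/store registers are 128 bits wide, costs
// divideCeil(256 - 1, 128) == 2.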
InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            Align Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) const {
  if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
    if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
        VecTy->getElementType()->isIntegerTy(8)) {
      return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
                        getLoadStoreVecRegBitWidth(AddressSpace));
    }
  }
  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
                                OpInfo, I);
}

unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
  if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
    if (VecTy->getElementType()->isIntegerTy(8)) {
      unsigned ElementCount = VecTy->getElementCount().getFixedValue();
      return divideCeil(ElementCount - 1, 4);
    }
  }
  return BaseT::getNumberOfParts(Tp);
}

  if (isAlwaysUniform(V))

  if (isSourceOfDivergence(V))

}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
The AMDGPU TargetMachine interface definition for hw codegen targets.
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
static cl::opt< unsigned > MemcpyLoopUnroll("amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " "operations when lowering memcpy as a loop"), cl::init(16), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
This file a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
#define LLVM_DEBUG(...)
Definition Debug.h:114
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
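Example (a minimal sketch assuming a new-pass-manager context; none of the names below come from this file): querying the unrolling preferences computed here through the public TargetTransformInfo facade.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/PassManager.h"
using namespace llvm;

static void queryUnrollPrefs(Function &F, Loop &L, FunctionAnalysisManager &FAM) {
  TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
  ScalarEvolution &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
  OptimizationRemarkEmitter &ORE =
      FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  // Real callers first seed UP with baseline defaults (see the loop unroller's
  // gatherUnrollingPreferences); that step is omitted here for brevity.
  TargetTransformInfo::UnrollingPreferences UP;
  TTI.getUnrollingPreferences(&L, SE, UP, &ORE);
  // UP.Threshold, UP.Partial, etc. now reflect the target's tuning.
}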
An instruction to allocate memory on the stack.
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
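Example (illustrative sketch): combining isStaticAlloca and getAllocationSize to obtain a fixed byte size for an alloca.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/TypeSize.h"
#include <optional>
using namespace llvm;

// Returns 0 for dynamic or scalable-size allocas.
static uint64_t staticAllocaBytes(const AllocaInst &AI, const DataLayout &DL) {
  if (!AI.isStaticAlloca())
    return 0;
  std::optional<TypeSize> Size = AI.getAllocationSize(DL);
  if (!Size || Size->isScalable())
    return 0;
  return Size->getFixedValue();
}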
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:103
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition Attributes.h:259
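Example (sketch; the attribute name is supplied by the caller and is not taken from this file): the usual isValid/getValueAsBool pattern for a boolean string function attribute.

#include "llvm/IR/Function.h"
using namespace llvm;

static bool hasBoolFnAttr(const Function &F, StringRef Name) {
  Attribute A = F.getFnAttribute(Name); // invalid Attribute if absent
  return A.isValid() && A.getValueAsBool();
}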
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
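Example (sketch; the type and cost kind are arbitrary choices): querying one of the cost hooks above through the public TargetTransformInfo interface, here for a packed <2 x half> add.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static InstructionCost v2f16AddCost(const TargetTransformInfo &TTI,
                                    LLVMContext &Ctx) {
  auto *V2F16 = FixedVectorType::get(Type::getHalfTy(Ctx), 2);
  return TTI.getArithmeticInstrCost(Instruction::FAdd, V2F16,
                                    TargetTransformInfo::TCK_RecipThroughput);
}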
Conditional or Unconditional Branch instruction.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool isInlineAsm() const
Check if this call is an inline asm statement.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
CallingConv::ID getCallingConv() const
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for an arg operand, get the arg operand number that corresponds to it.
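Example (sketch): iterating a call's argument operands and recovering each argument index with getArgOperandNo.

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

static void visitCallArgs(const CallBase &CB) {
  for (const Use &U : CB.args()) {
    unsigned ArgNo = CB.getArgOperandNo(&U); // position of this argument
    const Value *Arg = U.get();
    (void)ArgNo;
    (void)Arg;
  }
}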
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
ArrayRef< unsigned > getIndices() const
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Container class for subtarget features.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
unsigned getNumberOfRegisters(unsigned RCID) const override
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool shouldPrefetchAddressSpace(unsigned AS) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool hasBranchDivergence(const Function *F=nullptr) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
unsigned getInliningThresholdMultiplier() const override
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
unsigned adjustInliningThreshold(const CallBase *CB) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
int getInliningLastCallToStaticBonus() const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
unsigned getMinVectorRegisterBitWidth() const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
InstructionUniformity getInstructionUniformity(const Value *V) const override
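Example (sketch; the address spaces and alignments below are placeholder choices): asking which type the target wants a memcpy loop lowered with, via the public TargetTransformInfo interface.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/AMDGPUAddrSpace.h" // assumed location of the AMDGPUAS enum
#include "llvm/Support/Alignment.h"
#include <optional>
using namespace llvm;

static Type *memcpyLoweringType(const TargetTransformInfo &TTI,
                                LLVMContext &Ctx, Value *Length) {
  // Global-to-global copy, 16-byte aligned on both sides, no atomic element size.
  return TTI.getMemcpyLoopLoweringType(
      Ctx, Length, AMDGPUAS::GLOBAL_ADDRESS, AMDGPUAS::GLOBAL_ADDRESS,
      Align(16), Align(16), /*AtomicElementSize=*/std::nullopt);
}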
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
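Example (sketch): identifying a specific AMDGPU intrinsic call via IntrinsicInst::getIntrinsicID.

#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
using namespace llvm;

static bool isWorkitemIdX(const Value *V) {
  if (const auto *II = dyn_cast<IntrinsicInst>(V))
    return II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x;
  return false;
}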
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Metadata node.
Definition Metadata.h:1080
Machine Value Type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
The optimization diagnostic interface.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::vector< AsmOperandInfo > AsmOperandInfoVector
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
TargetCostKind
The kind of cost model.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:295
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
Base class of all SIMD vector types.
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
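Example (sketch): classifying a pointer by AMDGPU address space using the enumerators above.

#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/AMDGPUAddrSpace.h" // assumed location of the AMDGPUAS enum
using namespace llvm;

// Ptr must have pointer type.
static bool pointsToLDS(const Value *Ptr) {
  return Ptr->getType()->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}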
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
LLVM_ABI int getInstrCost()
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
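Example (sketch): combining the matchers above to recognize fneg(fabs(x)).

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// On success, X is bound to the operand of the fabs call.
static bool isNegatedFAbs(Value *V, Value *&X) {
  return match(V, m_FNeg(m_FAbs(m_Value(X))));
}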
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract_or_null(Y &&MD)
Extract a Value from Metadata, allowing null.
Definition Metadata.h:683
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2163
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
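Example (illustrative arithmetic only): divideCeil applied to the i8-packing rule mentioned for getNumberOfParts above, where sixteen i8 lanes occupy four 32-bit parts.

#include "llvm/Support/MathExtras.h"

// 16 x i8 = 128 bits; ceil(128 / 32) = 4 parts.
unsigned PartsForV16i8 = llvm::divideCeil(16u * 8u, 32u);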
@ FAdd
Sum of floats.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Parameters that control the generic loop unrolling transformation.
unsigned Threshold
The cost threshold for the unrolled loop.
bool UnrollVectorizedLoop
Disable runtime unrolling by default for vectorized loops.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
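Example (a minimal sketch, not the exact in-tree heuristics): how a target hook can populate the UnrollingPreferences fields documented above, bumping Threshold when private (scratch) memory is involved.

#include "llvm/Analysis/TargetTransformInfo.h"
#include <algorithm>
using namespace llvm;

static void sketchUnrollPrefs(TargetTransformInfo::UnrollingPreferences &UP,
                              bool LoopUsesPrivateMemory,
                              unsigned PrivateThreshold) {
  UP.Partial = true;                    // allow partial unrolling
  UP.Runtime = true;                    // allow runtime-trip-count unrolling
  UP.MaxIterationsCountToAnalyze = 32;  // cap full-unroll simulation
  if (LoopUsesPrivateMemory)
    UP.Threshold = std::max(UP.Threshold, PrivateThreshold);
  UP.PartialThreshold = UP.Threshold;
}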