LLVM 23.0.0git
AMDGPUTargetTransformInfo.cpp
Go to the documentation of this file.
1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
18#include "AMDGPUSubtarget.h"
19#include "AMDGPUTargetMachine.h"
27#include "llvm/IR/Function.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include <optional>
33
34using namespace llvm;
35
36#define DEBUG_TYPE "AMDGPUtti"
37
39 "amdgpu-unroll-threshold-private",
40 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
41 cl::init(2700), cl::Hidden);
42
44 "amdgpu-unroll-threshold-local",
45 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
46 cl::init(1000), cl::Hidden);
47
49 "amdgpu-unroll-threshold-if",
50 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
51 cl::init(200), cl::Hidden);
52
54 "amdgpu-unroll-runtime-local",
55 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
56 cl::init(true), cl::Hidden);
57
59 "amdgpu-unroll-max-block-to-analyze",
60 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
61 cl::init(32), cl::Hidden);
62
63static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
64 cl::Hidden, cl::init(4000),
65 cl::desc("Cost of alloca argument"));
66
67// If the amount of scratch memory to eliminate exceeds our ability to allocate
68// it into registers we gain nothing by aggressively inlining functions for that
69// heuristic.
71 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
72 cl::init(256),
73 cl::desc("Maximum alloca size to use for inline cost"));
74
75// Inliner constraint to achieve reasonable compilation time.
77 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
78 cl::desc("Maximum number of BBs allowed in a function after inlining"
79 " (compile time constraint)"));
80
81// This default unroll factor is based on microbenchmarks on gfx1030.
83 "amdgpu-memcpy-loop-unroll",
84 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
85 "operations when lowering statically-sized memcpy, memmove, or"
86 "memset as a loop"),
87 cl::init(16), cl::Hidden);
88
89static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
90 unsigned Depth = 0) {
92 if (!I)
93 return false;
94
95 for (const Value *V : I->operand_values()) {
96 if (!L->contains(I))
97 continue;
98 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
99 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
100 return SubLoop->contains(PHI); }))
101 return true;
102 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
103 return true;
104 }
105 return false;
106}
107
109 : BaseT(TM, F.getDataLayout()),
110 TargetTriple(TM->getTargetTriple()),
111 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
112 TLI(ST->getTargetLowering()) {}
113
116 OptimizationRemarkEmitter *ORE) const {
117 const Function &F = *L->getHeader()->getParent();
118 UP.Threshold =
119 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
120 UP.MaxCount = std::numeric_limits<unsigned>::max();
121 UP.Partial = true;
122
123 // Conditional branch in a loop back edge needs 3 additional exec
124 // manipulations on average.
125 UP.BEInsns += 3;
126
127 // We want to run unroll even for the loops which have been vectorized.
128 UP.UnrollVectorizedLoop = true;
129
130 // Enable runtime unrolling for loops whose trip count is not known at
131 // compile time.
132 UP.Runtime = true;
133
134 // Maximum alloca size that can fit in registers. Reserve 16 registers.
135 const unsigned MaxAlloca = (256 - 16) * 4;
136 unsigned ThresholdPrivate = UnrollThresholdPrivate;
137 unsigned ThresholdLocal = UnrollThresholdLocal;
138
139 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
140 // provided threshold value as the default for Threshold
141 if (MDNode *LoopUnrollThreshold =
142 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
143 if (LoopUnrollThreshold->getNumOperands() == 2) {
145 LoopUnrollThreshold->getOperand(1));
146 if (MetaThresholdValue) {
147 // We will also use the supplied value for PartialThreshold for now.
148 // We may introduce additional metadata if it becomes necessary in the
149 // future.
150 UP.Threshold = MetaThresholdValue->getSExtValue();
152 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
153 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
154 }
155 }
156 }
157
158 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
159 for (const BasicBlock *BB : L->getBlocks()) {
160 const DataLayout &DL = BB->getDataLayout();
161 unsigned LocalGEPsSeen = 0;
162
163 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
164 return SubLoop->contains(BB); }))
165 continue; // Block belongs to an inner loop.
166
167 for (const Instruction &I : *BB) {
168 // Unroll a loop which contains an "if" statement whose condition
169 // defined by a PHI belonging to the loop. This may help to eliminate
170 // if region and potentially even PHI itself, saving on both divergence
171 // and registers used for the PHI.
172 // Add a small bonus for each of such "if" statements.
173 if (const CondBrInst *Br = dyn_cast<CondBrInst>(&I)) {
174 if (UP.Threshold < MaxBoost) {
175 BasicBlock *Succ0 = Br->getSuccessor(0);
176 BasicBlock *Succ1 = Br->getSuccessor(1);
177 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
178 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
179 continue;
180 if (dependsOnLocalPhi(L, Br->getCondition())) {
182 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
183 << " for loop:\n"
184 << *L << " due to " << *Br << '\n');
185 if (UP.Threshold >= MaxBoost)
186 return;
187 }
188 }
189 continue;
190 }
191
193 if (!GEP)
194 continue;
195
196 unsigned AS = GEP->getAddressSpace();
197 unsigned Threshold = 0;
199 Threshold = ThresholdPrivate;
201 Threshold = ThresholdLocal;
202 else
203 continue;
204
205 if (UP.Threshold >= Threshold)
206 continue;
207
208 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
209 const Value *Ptr = GEP->getPointerOperand();
210 const AllocaInst *Alloca =
212 if (!Alloca || !Alloca->isStaticAlloca())
213 continue;
214 auto AllocaSize = Alloca->getAllocationSize(DL);
215 if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
216 continue;
217 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
219 LocalGEPsSeen++;
220 // Inhibit unroll for local memory if we have seen addressing not to
221 // a variable, most likely we will be unable to combine it.
222 // Do not unroll too deep inner loops for local memory to give a chance
223 // to unroll an outer loop for a more important reason.
224 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
225 (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
226 !isa<Argument>(GEP->getPointerOperand())))
227 continue;
228 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
229 << *L << " due to LDS use.\n");
231 }
232
233 // Check if GEP depends on a value defined by this loop itself.
234 bool HasLoopDef = false;
235 for (const Value *Op : GEP->operands()) {
236 const Instruction *Inst = dyn_cast<Instruction>(Op);
237 if (!Inst || L->isLoopInvariant(Op))
238 continue;
239
240 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
241 return SubLoop->contains(Inst); }))
242 continue;
243 HasLoopDef = true;
244 break;
245 }
246 if (!HasLoopDef)
247 continue;
248
249 // We want to do whatever we can to limit the number of alloca
250 // instructions that make it through to the code generator. allocas
251 // require us to use indirect addressing, which is slow and prone to
252 // compiler bugs. If this loop does an address calculation on an
253 // alloca ptr, then we want to use a higher than normal loop unroll
254 // threshold. This will give SROA a better chance to eliminate these
255 // allocas.
256 //
257 // We also want to have more unrolling for local memory to let ds
258 // instructions with different offsets combine.
259 //
260 // Don't use the maximum allowed value here as it will make some
261 // programs way too big.
262 UP.Threshold = Threshold;
263 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
264 << " for loop:\n"
265 << *L << " due to " << *GEP << '\n');
266 if (UP.Threshold >= MaxBoost)
267 return;
268 }
269
270 // If we got a GEP in a small BB from inner loop then increase max trip
271 // count to analyze for better estimation cost in unroll
272 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
274 }
275}
276
281
285
286const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
287 // Codegen control options which don't matter.
288 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
289 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
290 AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode,
291
292 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
293
294 // Property of the kernel/environment which can't actually differ.
295 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
296 AMDGPU::FeatureTrapHandler,
297
298 // The default assumption needs to be ecc is enabled, but no directly
299 // exposed operations depend on it, so it can be safely inlined.
300 AMDGPU::FeatureSRAMECC,
301
302 // Perf-tuning features
303 AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
304
306 : BaseT(TM, F.getDataLayout()),
307 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
308 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
309 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
311 HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
312 HasFP64FP16Denormals =
313 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
314}
315
317 return !F || !ST->isSingleLaneExecution(*F);
318}
319
320unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
321 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
322 // registers. See getRegisterClassForType for the implementation.
323 // In this case vector registers are not vector in terms of
324 // VGPRs, but those which can hold multiple values.
325
326 // This is really the number of registers to fill when vectorizing /
327 // interleaving loops, so we lie to avoid trying to use all registers.
328 return 4;
329}
330
333 switch (K) {
335 return TypeSize::getFixed(32);
337 return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
339 return TypeSize::getScalable(0);
340 }
341 llvm_unreachable("Unsupported register kind");
342}
343
345 return 32;
346}
347
348unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
349 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
350 return 32 * 4 / ElemWidth;
351 // For a given width return the max 0number of elements that can be combined
352 // into a wider bit value:
353 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
354 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
355 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
356 : 1;
357}
358
360 // The integer inst-count heuristic causes regressions on gfx94x and gfx950
361 // because 2-element vector trees that pass the scalar/vector instruction
362 // count comparison still widen scalar moves (e.g. v_mov_b32 to v_mov_b64)
363 // after codegen, increasing register pressure and throughput cost without
364 // reducing the total instruction count.
365 return !ST->hasGFX940Insts() && !ST->hasGFX950Insts();
366}
367
368unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
369 unsigned ChainSizeInBytes,
370 VectorType *VecTy) const {
371 unsigned VecRegBitWidth = VF * LoadSize;
372 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
373 // TODO: Support element-size less than 32bit?
374 return 128 / LoadSize;
375
376 return VF;
377}
378
379unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
380 unsigned ChainSizeInBytes,
381 VectorType *VecTy) const {
382 unsigned VecRegBitWidth = VF * StoreSize;
383 if (VecRegBitWidth > 128)
384 return 128 / StoreSize;
385
386 return VF;
387}
388
389unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
390 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
391 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
393 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
394 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
396 return 512;
397 }
398
399 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
400 return 8 * ST->getMaxPrivateElementSize();
401
402 // Common to flat, global, local and region. Assume for unknown addrspace.
403 return 128;
404}
405
406bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
407 Align Alignment,
408 unsigned AddrSpace) const {
409 // We allow vectorization of flat stores, even though we may need to decompose
410 // them later if they may access private memory. We don't have enough context
411 // here, and legalization can handle it.
412 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
413 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
414 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
415 }
416 return true;
417}
418
419bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
420 Align Alignment,
421 unsigned AddrSpace) const {
422 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
423}
424
425bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
426 Align Alignment,
427 unsigned AddrSpace) const {
428 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
429}
430
434
436 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
437 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
438 std::optional<uint32_t> AtomicElementSize) const {
439
440 if (AtomicElementSize)
441 return Type::getIntNTy(Context, *AtomicElementSize * 8);
442
443 // 16-byte accesses achieve the highest copy throughput.
444 // If the operation has a fixed known length that is large enough, it is
445 // worthwhile to return an even wider type and let legalization lower it into
446 // multiple accesses, effectively unrolling the memcpy loop.
447 // We also rely on legalization to decompose into smaller accesses for
448 // subtargets and address spaces where it is necessary.
449 //
450 // Don't unroll if Length is not a constant, since unrolling leads to worse
451 // performance for length values that are smaller or slightly larger than the
452 // total size of the type returned here. Mitigating that would require a more
453 // complex lowering for variable-length memcpy and memmove.
454 unsigned I32EltsInVector = 4;
457 MemcpyLoopUnroll * I32EltsInVector);
458
459 return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
460}
461
463 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
464 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
465 Align SrcAlign, Align DestAlign,
466 std::optional<uint32_t> AtomicCpySize) const {
467
468 if (AtomicCpySize)
470 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
471 DestAlign, AtomicCpySize);
472
473 Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
474 while (RemainingBytes >= 16) {
475 OpsOut.push_back(I32x4Ty);
476 RemainingBytes -= 16;
477 }
478
479 Type *I64Ty = Type::getInt64Ty(Context);
480 while (RemainingBytes >= 8) {
481 OpsOut.push_back(I64Ty);
482 RemainingBytes -= 8;
483 }
484
485 Type *I32Ty = Type::getInt32Ty(Context);
486 while (RemainingBytes >= 4) {
487 OpsOut.push_back(I32Ty);
488 RemainingBytes -= 4;
489 }
490
491 Type *I16Ty = Type::getInt16Ty(Context);
492 while (RemainingBytes >= 2) {
493 OpsOut.push_back(I16Ty);
494 RemainingBytes -= 2;
495 }
496
497 Type *I8Ty = Type::getInt8Ty(Context);
498 while (RemainingBytes) {
499 OpsOut.push_back(I8Ty);
500 --RemainingBytes;
501 }
502}
503
505 // Disable unrolling if the loop is not vectorized.
506 // TODO: Enable this again.
507 if (VF.isScalar())
508 return 1;
509
510 return 8;
511}
512
514 MemIntrinsicInfo &Info) const {
515 switch (Inst->getIntrinsicID()) {
516 case Intrinsic::amdgcn_ds_ordered_add:
517 case Intrinsic::amdgcn_ds_ordered_swap: {
518 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
519 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
520 if (!Ordering || !Volatile)
521 return false; // Invalid.
522
523 unsigned OrderingVal = Ordering->getZExtValue();
524 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
525 return false;
526
527 Info.PtrVal = Inst->getArgOperand(0);
528 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
529 Info.ReadMem = true;
530 Info.WriteMem = true;
531 Info.IsVolatile = !Volatile->isZero();
532 return true;
533 }
534 default:
535 return false;
536 }
537}
538
540 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
542 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
543
544 // Legalize the type.
545 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
546 int ISD = TLI->InstructionOpcodeToISD(Opcode);
547
548 // Because we don't have any legal vector operations, but the legal types, we
549 // need to account for split vectors.
550 unsigned NElts = LT.second.isVector() ?
551 LT.second.getVectorNumElements() : 1;
552
553 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
554
555 switch (ISD) {
556 case ISD::SHL:
557 case ISD::SRL:
558 case ISD::SRA:
559 if (SLT == MVT::i64)
560 return get64BitInstrCost(CostKind) * LT.first * NElts;
561
562 if (ST->has16BitInsts() && SLT == MVT::i16)
563 NElts = (NElts + 1) / 2;
564
565 // i32
566 return getFullRateInstrCost() * LT.first * NElts;
567 case ISD::ADD:
568 case ISD::SUB:
569 case ISD::AND:
570 case ISD::OR:
571 case ISD::XOR:
572 if (SLT == MVT::i64) {
573 // and, or and xor are typically split into 2 VALU instructions.
574 return 2 * getFullRateInstrCost() * LT.first * NElts;
575 }
576
577 if (ST->has16BitInsts() && SLT == MVT::i16)
578 NElts = (NElts + 1) / 2;
579
580 return LT.first * NElts * getFullRateInstrCost();
581 case ISD::MUL: {
582 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
583 if (SLT == MVT::i64) {
584 const int FullRateCost = getFullRateInstrCost();
585 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
586 }
587
588 if (ST->has16BitInsts() && SLT == MVT::i16)
589 NElts = (NElts + 1) / 2;
590
591 // i32
592 return QuarterRateCost * NElts * LT.first;
593 }
594 case ISD::FMUL:
595 // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
596 // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
597 // fused operation.
598 if (CxtI && CxtI->hasOneUse())
599 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
600 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
601 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
602 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
604 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
606
607 // Estimate all types may be fused with contract/unsafe flags
608 const TargetOptions &Options = TLI->getTargetMachine().Options;
609 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
610 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
612 }
613 }
614 [[fallthrough]];
615 case ISD::FADD:
616 case ISD::FSUB:
617 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
618 NElts = (NElts + 1) / 2;
619 if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
620 NElts = (NElts + 1) / 2;
621 if (SLT == MVT::f64)
622 return LT.first * NElts * get64BitInstrCost(CostKind);
623
624 if (ST->has16BitInsts() && SLT == MVT::f16)
625 NElts = (NElts + 1) / 2;
626
627 if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
628 return LT.first * NElts * getFullRateInstrCost();
629 break;
630 case ISD::FDIV:
631 case ISD::FREM:
632 // FIXME: frem should be handled separately. The fdiv in it is most of it,
633 // but the current lowering is also not entirely correct.
634 if (SLT == MVT::f64) {
635 int Cost = 7 * get64BitInstrCost(CostKind) +
636 getQuarterRateInstrCost(CostKind) +
637 3 * getHalfRateInstrCost(CostKind);
638 // Add cost of workaround.
639 if (!ST->hasUsableDivScaleConditionOutput())
640 Cost += 3 * getFullRateInstrCost();
641
642 return LT.first * Cost * NElts;
643 }
644
645 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
646 // TODO: This is more complicated, unsafe flags etc.
647 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
648 (SLT == MVT::f16 && ST->has16BitInsts())) {
649 return LT.first * getTransInstrCost(CostKind) * NElts;
650 }
651 }
652
653 if (SLT == MVT::f16 && ST->has16BitInsts()) {
654 // 2 x v_cvt_f32_f16
655 // f32 rcp
656 // f32 fmul
657 // v_cvt_f16_f32
658 // f16 div_fixup
659 int Cost = 4 * getFullRateInstrCost() + 2 * getTransInstrCost(CostKind);
660 return LT.first * Cost * NElts;
661 }
662
663 if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
664 // Fast unsafe fdiv lowering:
665 // f32 rcp
666 // f32 fmul
667 int Cost = getTransInstrCost(CostKind) + getFullRateInstrCost();
668 return LT.first * Cost * NElts;
669 }
670
671 if (SLT == MVT::f32 || SLT == MVT::f16) {
672 // 4 more v_cvt_* insts without f16 insts support
673 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
674 1 * getTransInstrCost(CostKind);
675
676 if (!HasFP32Denormals) {
677 // FP mode switches.
678 Cost += 2 * getFullRateInstrCost();
679 }
680
681 return LT.first * NElts * Cost;
682 }
683 break;
684 case ISD::FNEG:
685 // Use the backend's estimation. If fneg is not free each element will cost
686 // one additional instruction.
687 return TLI->isFNegFree(SLT) ? 0 : NElts;
688 default:
689 break;
690 }
691
692 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
693 Args, CxtI);
694}
695
696// Return true if there's a potential benefit from using v2f16/v2i16
697// instructions for an intrinsic, even if it requires nontrivial legalization.
699 switch (ID) {
700 case Intrinsic::fma:
701 case Intrinsic::fmuladd:
702 case Intrinsic::copysign:
703 case Intrinsic::minimumnum:
704 case Intrinsic::maximumnum:
705 case Intrinsic::canonicalize:
706 // There's a small benefit to using vector ops in the legalized code.
707 case Intrinsic::round:
708 case Intrinsic::uadd_sat:
709 case Intrinsic::usub_sat:
710 case Intrinsic::sadd_sat:
711 case Intrinsic::ssub_sat:
712 case Intrinsic::abs:
713 return true;
714 default:
715 return false;
716 }
717}
718
722 switch (ICA.getID()) {
723 case Intrinsic::fabs:
724 // Free source modifier in the common case.
725 return 0;
726 case Intrinsic::amdgcn_workitem_id_x:
727 case Intrinsic::amdgcn_workitem_id_y:
728 case Intrinsic::amdgcn_workitem_id_z:
729 // TODO: If hasPackedTID, or if the calling context is not an entry point
730 // there may be a bit instruction.
731 return 0;
732 case Intrinsic::amdgcn_workgroup_id_x:
733 case Intrinsic::amdgcn_workgroup_id_y:
734 case Intrinsic::amdgcn_workgroup_id_z:
735 case Intrinsic::amdgcn_lds_kernel_id:
736 case Intrinsic::amdgcn_dispatch_ptr:
737 case Intrinsic::amdgcn_dispatch_id:
738 case Intrinsic::amdgcn_implicitarg_ptr:
739 case Intrinsic::amdgcn_queue_ptr:
740 // Read from an argument register.
741 return 0;
742 default:
743 break;
744 }
745
746 Type *RetTy = ICA.getReturnType();
747
748 Intrinsic::ID IID = ICA.getID();
749 switch (IID) {
750 case Intrinsic::exp:
751 case Intrinsic::exp2:
752 case Intrinsic::exp10: {
753 // Legalize the type.
754 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
755 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
756 unsigned NElts =
757 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
758
759 if (SLT == MVT::f64) {
760 unsigned NumOps = 20;
761 if (IID == Intrinsic::exp)
762 ++NumOps;
763 else if (IID == Intrinsic::exp10)
764 NumOps += 3;
765
766 return LT.first * NElts * NumOps * get64BitInstrCost(CostKind);
767 }
768
769 if (SLT == MVT::f32) {
770 unsigned NumFullRateOps = 0;
771 // v_exp_f32 (transcendental).
772 unsigned NumTransOps = 1;
773
774 if (!ICA.getFlags().approxFunc() && IID != Intrinsic::exp2) {
775 // Non-AFN exp/exp10: range reduction + v_exp_f32 + ldexp +
776 // overflow/underflow checks (lowerFEXP). Denorm is also handled.
777 // FMA preamble: ~13 full-rate ops; non-FMA: ~17.
778 NumFullRateOps = ST->hasFastFMAF32() ? 13 : 17;
779 } else {
780 if (IID == Intrinsic::exp) {
781 // lowerFEXPUnsafe: fmul (base conversion) + v_exp_f32.
782 NumFullRateOps = 1;
783 } else if (IID == Intrinsic::exp10) {
784 // lowerFEXP10Unsafe: 3 fmul + 2 v_exp_f32 (double-exp2).
785 NumFullRateOps = 3;
786 NumTransOps = 2;
787 }
788 // Denorm scaling adds setcc + select + fadd + select + fmul.
789 if (HasFP32Denormals)
790 NumFullRateOps += 5;
791 }
792
793 InstructionCost Cost = NumFullRateOps * getFullRateInstrCost() +
794 NumTransOps * getTransInstrCost(CostKind);
795 return LT.first * NElts * Cost;
796 }
797
798 break;
799 }
800 case Intrinsic::log:
801 case Intrinsic::log2:
802 case Intrinsic::log10: {
803 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
804 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
805 unsigned NElts =
806 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
807
808 if (SLT == MVT::f32) {
809 unsigned NumFullRateOps = 0;
810
811 if (IID == Intrinsic::log2) {
812 // LowerFLOG2: just v_log_f32.
813 } else if (ICA.getFlags().approxFunc()) {
814 // LowerFLOGUnsafe: v_log_f32 + fmul (base conversion).
815 NumFullRateOps = 1;
816 } else {
817 // LowerFLOGCommon non-AFN: v_log_f32 + extended-precision
818 // multiply + finite check.
819 NumFullRateOps = ST->hasFastFMAF32() ? 8 : 11;
820 }
821
822 if (HasFP32Denormals)
823 NumFullRateOps += 5;
824
826 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
827 return LT.first * NElts * Cost;
828 }
829
830 break;
831 }
832 case Intrinsic::sin:
833 case Intrinsic::cos: {
834 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
835 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
836 unsigned NElts =
837 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
838
839 if (SLT == MVT::f32) {
840 // LowerTrig: fmul(1/2pi) + v_sin/v_cos.
841 unsigned NumFullRateOps = ST->hasTrigReducedRange() ? 2 : 1;
842
844 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
845 return LT.first * NElts * Cost;
846 }
847
848 break;
849 }
850 case Intrinsic::sqrt: {
851 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
852 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
853 unsigned NElts =
854 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
855
856 if (SLT == MVT::f32) {
857 unsigned NumFullRateOps = 0;
858
859 if (!ICA.getFlags().approxFunc()) {
860 // lowerFSQRTF32 non-AFN: v_sqrt_f32 + refinement + scale fixup.
861 NumFullRateOps = HasFP32Denormals ? 17 : 16;
862 }
863
865 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
866 return LT.first * NElts * Cost;
867 }
868
869 break;
870 }
871 default:
872 break;
873 }
874
877
878 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
879 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
880 unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
881
882 if ((ST->hasVOP3PInsts() &&
883 (SLT == MVT::f16 || SLT == MVT::i16 ||
884 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
885 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
886 NElts = (NElts + 1) / 2;
887
888 // TODO: Get more refined intrinsic costs?
889 unsigned InstRate = getQuarterRateInstrCost(CostKind);
890
891 switch (ICA.getID()) {
892 case Intrinsic::fma:
893 case Intrinsic::fmuladd:
894 if (SLT == MVT::f64) {
895 InstRate = get64BitInstrCost(CostKind);
896 break;
897 }
898
899 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
900 InstRate = getFullRateInstrCost();
901 else {
902 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
903 : getQuarterRateInstrCost(CostKind);
904 }
905 break;
906 case Intrinsic::copysign:
907 return NElts * getFullRateInstrCost();
908 case Intrinsic::minimumnum:
909 case Intrinsic::maximumnum: {
910 // Instruction + 2 canonicalizes. For cases that need type promotion, the
911 // promotion takes the place of the canonicalize.
912 unsigned NumOps = 3;
913 if (const IntrinsicInst *II = ICA.getInst()) {
914 // Directly legal with ieee=0
915 // TODO: Not directly legal with strictfp
917 NumOps = 1;
918 }
919
920 unsigned BaseRate =
921 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
922 InstRate = BaseRate * NumOps;
923 break;
924 }
925 case Intrinsic::canonicalize: {
926 InstRate =
927 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
928 break;
929 }
930 case Intrinsic::uadd_sat:
931 case Intrinsic::usub_sat:
932 case Intrinsic::sadd_sat:
933 case Intrinsic::ssub_sat: {
934 if (SLT == MVT::i16 || SLT == MVT::i32)
935 InstRate = getFullRateInstrCost();
936
937 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
938 if (any_of(ValidSatTys, equal_to(LT.second)))
939 NElts = 1;
940 break;
941 }
942 case Intrinsic::abs:
943 // Expansion takes 2 instructions for VALU
944 if (SLT == MVT::i16 || SLT == MVT::i32)
945 InstRate = 2 * getFullRateInstrCost();
946 break;
947 default:
948 break;
949 }
950
951 return LT.first * NElts * InstRate;
952}
953
956 const Instruction *I) const {
957 assert((I == nullptr || I->getOpcode() == Opcode) &&
958 "Opcode should reflect passed instruction.");
959 const bool SCost =
961 const int CBrCost = SCost ? 5 : 7;
962 switch (Opcode) {
963 case Instruction::UncondBr:
964 // Branch instruction takes about 4 slots on gfx900.
965 return SCost ? 1 : 4;
966 case Instruction::CondBr:
967 // Suppose conditional branch takes additional 3 exec manipulations
968 // instructions on average.
969 return CBrCost;
970 case Instruction::Switch: {
971 const auto *SI = dyn_cast_or_null<SwitchInst>(I);
972 // Each case (including default) takes 1 cmp + 1 cbr instructions in
973 // average.
974 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
975 }
976 case Instruction::Ret:
977 return SCost ? 1 : 10;
978 }
979 return BaseT::getCFInstrCost(Opcode, CostKind, I);
980}
981
984 std::optional<FastMathFlags> FMF,
987 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
988
989 EVT OrigTy = TLI->getValueType(DL, Ty);
990
991 // Computes cost on targets that have packed math instructions(which support
992 // 16-bit types only).
993 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
994 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
995
996 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
997 return LT.first * getFullRateInstrCost();
998}
999
1002 FastMathFlags FMF,
1004 EVT OrigTy = TLI->getValueType(DL, Ty);
1005
1006 // Computes cost on targets that have packed math instructions(which support
1007 // 16-bit types only).
1008 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
1009 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1010
1011 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1012 return LT.first * getHalfRateInstrCost(CostKind);
1013}
1014
1016 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
1017 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
1018 switch (Opcode) {
1019 case Instruction::ExtractElement:
1020 case Instruction::InsertElement: {
1021 unsigned EltSize
1022 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
1023 // Dynamic indexing isn't free and is best avoided.
1024 if (Index == ~0u)
1025 return 2;
1026 if (EltSize < 32) {
1027 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
1028 return 0;
1029 // Some i8 inserts and extracts are free so we want to reduce the
1030 // cost to avoid scalarization. We limit the zero cost cases to avoid
1031 // adversely impacting all i8 vectorizing.
1032 if (EltSize == 8) {
1033 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1034 if (NumElts >= 4 && isPowerOf2_32(NumElts)) {
1035 // Extracts at indices aligned to 32-bit boundaries (0, 4, 8, 12 for
1036 // v16i8) are free as they access the low byte of each VGPR. Other
1037 // indices require bit manipulation (shifts/byte selects) and cost 1.
1038 return Index % 4 == 0 ? 0 : 1;
1039 }
1040 }
1041 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
1042 VIC);
1043 }
1044
1045 // Extracts are just reads of a subregister, so are free. Inserts are
1046 // considered free because we don't want to have any cost for scalarizing
1047 // operations, and we don't have to copy into a different register class.
1048 return 0;
1049 }
1050 default:
1051 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
1052 VIC);
1053 }
1054}
1055
1056/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
1057/// this is analyzing the collective result of all output registers. Otherwise,
1058/// this is only querying a specific result index if this returns multiple
1059/// registers in a struct.
1061 const CallInst *CI, ArrayRef<unsigned> Indices) const {
1062 // TODO: Handle complex extract indices
1063 if (Indices.size() > 1)
1064 return true;
1065
1066 const DataLayout &DL = CI->getDataLayout();
1067 const SIRegisterInfo *TRI = ST->getRegisterInfo();
1068 TargetLowering::AsmOperandInfoVector TargetConstraints =
1069 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
1070
1071 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
1072
1073 int OutputIdx = 0;
1074 for (auto &TC : TargetConstraints) {
1075 if (TC.Type != InlineAsm::isOutput)
1076 continue;
1077
1078 // Skip outputs we don't care about.
1079 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
1080 continue;
1081
1082 TLI->ComputeConstraintToUse(TC, SDValue());
1083
1084 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
1085 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
1086
1087 // For AGPR constraints null is returned on subtargets without AGPRs, so
1088 // assume divergent for null.
1089 if (!RC || !TRI->isSGPRClass(RC))
1090 return true;
1091 }
1092
1093 return false;
1094}
1095
1097 const IntrinsicInst *ReadReg) const {
1098 Metadata *MD =
1099 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
1101 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
1102
1103 // Special case registers that look like VCC.
1104 MVT VT = MVT::getVT(ReadReg->getType());
1105 if (VT == MVT::i1)
1106 return true;
1107
1108 // Special case scalar registers that start with 'v'.
1109 if (RegName.starts_with("vcc") || RegName.empty())
1110 return false;
1111
1112 // VGPR or AGPR is divergent. There aren't any specially named vector
1113 // registers.
1114 return RegName[0] == 'v' || RegName[0] == 'a';
1115}
1116
1117/// \returns true if the result of the value could potentially be
1118/// different across workitems in a wavefront.
1119bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
1120 if (const Argument *A = dyn_cast<Argument>(V))
1122
1123 // Loads from the private and flat address spaces are divergent, because
1124 // threads can execute the load instruction with the same inputs and get
1125 // different results.
1126 //
1127 // All other loads are not divergent, because if threads issue loads with the
1128 // same arguments, they will always get the same result.
1129 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
1130 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
1131 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
1132
1133 // Atomics are divergent because they are executed sequentially: when an
1134 // atomic operation refers to the same address in each thread, then each
1135 // thread after the first sees the value written by the previous thread as
1136 // original value.
1138 return true;
1139
1141 Intrinsic::ID IID = Intrinsic->getIntrinsicID();
1142 switch (IID) {
1143 case Intrinsic::read_register:
1145 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1146 unsigned SrcAS =
1147 Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
1148 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1149 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1150 DstAS == AMDGPUAS::FLAT_ADDRESS &&
1151 ST->hasGloballyAddressableScratch();
1152 }
1153 case Intrinsic::amdgcn_workitem_id_y:
1154 case Intrinsic::amdgcn_workitem_id_z: {
1155 const Function *F = Intrinsic->getFunction();
1156 bool HasUniformYZ =
1157 ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequitezUniformYZ=*/true);
1158 std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1159 *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1160 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1161 }
1162 default:
1164 }
1165 }
1166
1167 // Assume all function calls are a source of divergence.
1168 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1169 if (CI->isInlineAsm())
1171 return true;
1172 }
1173
1174 // Assume all function calls are a source of divergence.
1175 if (isa<InvokeInst>(V))
1176 return true;
1177
1178 // If the target supports globally addressable scratch, the mapping from
1179 // scratch memory to the flat aperture changes; therefore, an address space
1180 // cast is no longer uniform.
1181 if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
1182 return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1183 CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1184 ST->hasGloballyAddressableScratch();
1185 }
1186
1187 return false;
1188}
1189
1190bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
1191 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
1192 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
1193
1194 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1195 if (CI->isInlineAsm())
1197 return false;
1198 }
1199
1200 // In most cases TID / wavefrontsize is uniform.
1201 //
1202 // However, if a kernel has uneven dimensions, the value of workitem-id-x
1203 // divided by the wavefront size can be non-uniform. For example,
1204 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
1205 // packed into the same wave, which gives 1 and 0 after the division by 64
1206 // respectively.
1207 //
1208 // The X dimension doesn't reset within a wave if either both the Y
1209 // and Z dimensions are of length 1, or if the X dimension's required
1210 // size is a power of 2. Note, however, if the X dimension's maximum
1211 // size is a power of 2 < the wavefront size, division by the wavefront
1212 // size is guaranteed to yield 0, so this is also a no-reset case.
1213 bool XDimDoesntResetWithinWaves = false;
1214 if (auto *I = dyn_cast<Instruction>(V)) {
1215 const Function *F = I->getFunction();
1216 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
1217 }
1218 using namespace llvm::PatternMatch;
1219 uint64_t C;
1221 m_ConstantInt(C))) ||
1223 m_ConstantInt(C)))) {
1224 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1225 }
1226
1227 Value *Mask;
1229 m_Value(Mask)))) {
1230 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
1231 ST->getWavefrontSizeLog2() &&
1232 XDimDoesntResetWithinWaves;
1233 }
1234
1235 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1236 if (!ExtValue)
1237 return false;
1238
1239 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1240 if (!CI)
1241 return false;
1242
1243 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1244 switch (Intrinsic->getIntrinsicID()) {
1245 default:
1246 return false;
1247 case Intrinsic::amdgcn_if:
1248 case Intrinsic::amdgcn_else: {
1249 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1250 return Indices.size() == 1 && Indices[0] == 1;
1251 }
1252 }
1253 }
1254
1255 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1256 // divergent for the overall struct return. We need to override it in the
1257 // case we're extracting an SGPR component here.
1258 if (CI->isInlineAsm())
1259 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1260
1261 return false;
1262}
1263
1265 Intrinsic::ID IID) const {
1266 switch (IID) {
1267 case Intrinsic::amdgcn_is_shared:
1268 case Intrinsic::amdgcn_is_private:
1269 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1270 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1271 case Intrinsic::amdgcn_load_to_lds:
1272 case Intrinsic::amdgcn_make_buffer_rsrc:
1273 OpIndexes.push_back(0);
1274 return true;
1275 default:
1276 return false;
1277 }
1278}
1279
1281 Value *OldV,
1282 Value *NewV) const {
1283 auto IntrID = II->getIntrinsicID();
1284 switch (IntrID) {
1285 case Intrinsic::amdgcn_is_shared:
1286 case Intrinsic::amdgcn_is_private: {
1287 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1289 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1290 LLVMContext &Ctx = NewV->getType()->getContext();
1291 ConstantInt *NewVal = (TrueAS == NewAS) ?
1293 return NewVal;
1294 }
1295 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1296 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1297 Type *DestTy = II->getType();
1298 Type *SrcTy = NewV->getType();
1299 unsigned NewAS = SrcTy->getPointerAddressSpace();
1301 return nullptr;
1302 Module *M = II->getModule();
1304 M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1305 II->setArgOperand(0, NewV);
1306 II->setCalledFunction(NewDecl);
1307 return II;
1308 }
1309 case Intrinsic::amdgcn_load_to_lds: {
1310 Type *SrcTy = NewV->getType();
1311 Module *M = II->getModule();
1312 Function *NewDecl =
1313 Intrinsic::getOrInsertDeclaration(M, II->getIntrinsicID(), {SrcTy});
1314 II->setArgOperand(0, NewV);
1315 II->setCalledFunction(NewDecl);
1316 return II;
1317 }
1318 case Intrinsic::amdgcn_make_buffer_rsrc: {
1319 Type *SrcTy = NewV->getType();
1320 Type *DstTy = II->getType();
1321 Module *M = II->getModule();
1323 M, II->getIntrinsicID(), {DstTy, SrcTy});
1324 II->setArgOperand(0, NewV);
1325 II->setCalledFunction(NewDecl);
1326 return II;
1327 }
1328 default:
1329 return nullptr;
1330 }
1331}
1332
1334 VectorType *DstTy, VectorType *SrcTy,
1335 ArrayRef<int> Mask,
1337 int Index, VectorType *SubTp,
1339 const Instruction *CxtI) const {
1340 if (!isa<FixedVectorType>(SrcTy))
1341 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1342 SubTp);
1343
1344 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1345
1346 unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
1347 if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1348 (ScalarSize == 16 || ScalarSize == 8)) {
1349 // Larger vector widths may require additional instructions, but are
1350 // typically cheaper than scalarized versions.
1351 //
1352 // We assume that shuffling at a register granularity can be done for free.
1353 // This is not true for vectors fed into memory instructions, but it is
1354 // effectively true for all other shuffling. The emphasis of the logic here
1355 // is to assist generic transform in cleaning up / canonicalizing those
1356 // shuffles.
1357
1358 // With op_sel VOP3P instructions freely can access the low half or high
1359 // half of a register, so any swizzle of two elements is free.
1360 if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {
1361 unsigned NumSrcElts = SrcVecTy->getNumElements();
1362 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
1363 (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
1364 Kind == TTI::SK_PermuteSingleSrc))
1365 return 0;
1366 }
1367
1368 unsigned EltsPerReg = 32 / ScalarSize;
1369 switch (Kind) {
1370 case TTI::SK_Broadcast:
1371 // A single v_perm_b32 can be re-used for all destination registers.
1372 return 1;
1373 case TTI::SK_Reverse:
1374 // One instruction per register.
1375 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1376 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1379 if (Index % EltsPerReg == 0)
1380 return 0; // Shuffling at register granularity
1381 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1382 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1385 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1386 if (!DstVecTy)
1388 unsigned NumDstElts = DstVecTy->getNumElements();
1389 unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
1390 unsigned EndIndex = Index + NumInsertElts;
1391 unsigned BeginSubIdx = Index % EltsPerReg;
1392 unsigned EndSubIdx = EndIndex % EltsPerReg;
1393 unsigned Cost = 0;
1394
1395 if (BeginSubIdx != 0) {
1396 // Need to shift the inserted vector into place. The cost is the number
1397 // of destination registers overlapped by the inserted vector.
1398 Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
1399 }
1400
1401 // If the last register overlap is partial, there may be three source
1402 // registers feeding into it; that takes an extra instruction.
1403 if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
1404 Cost += 1;
1405
1406 return Cost;
1407 }
1408 case TTI::SK_Splice: {
1409 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1410 if (!DstVecTy)
1412 unsigned NumElts = DstVecTy->getNumElements();
1413 assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
1414 // Determine the sub-region of the result vector that requires
1415 // sub-register shuffles / mixing.
1416 unsigned EltsFromLHS = NumElts - Index;
1417 bool LHSIsAligned = (Index % EltsPerReg) == 0;
1418 bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
1419 if (LHSIsAligned && RHSIsAligned)
1420 return 0;
1421 if (LHSIsAligned && !RHSIsAligned)
1422 return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
1423 if (!LHSIsAligned && RHSIsAligned)
1424 return divideCeil(EltsFromLHS, EltsPerReg);
1425 return divideCeil(NumElts, EltsPerReg);
1426 }
1427 default:
1428 break;
1429 }
1430
1431 if (!Mask.empty()) {
1432 unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
1433
1434 // Generically estimate the cost by assuming that each destination
1435 // register is derived from sources via v_perm_b32 instructions if it
1436 // can't be copied as-is.
1437 //
1438 // For each destination register, derive the cost of obtaining it based
1439 // on the number of source registers that feed into it.
1440 unsigned Cost = 0;
1441 for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
1443 bool Aligned = true;
1444 for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
1445 int SrcIdx = Mask[DstIdx + I];
1446 if (SrcIdx == -1)
1447 continue;
1448 int Reg;
1449 if (SrcIdx < (int)NumSrcElts) {
1450 Reg = SrcIdx / EltsPerReg;
1451 if (SrcIdx % EltsPerReg != I)
1452 Aligned = false;
1453 } else {
1454 Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
1455 if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
1456 Aligned = false;
1457 }
1458 if (!llvm::is_contained(Regs, Reg))
1459 Regs.push_back(Reg);
1460 }
1461 if (Regs.size() >= 2)
1462 Cost += Regs.size() - 1;
1463 else if (!Aligned)
1464 Cost += 1;
1465 }
1466 return Cost;
1467 }
1468 }
1469
1470 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1471 SubTp);
1472}
1473
1474/// Whether it is profitable to sink the operands of an
1475/// Instruction I to the basic block of I.
1476/// This helps using several modifiers (like abs and neg) more often.
1478 SmallVectorImpl<Use *> &Ops) const {
1479 using namespace PatternMatch;
1480
1481 for (auto &Op : I->operands()) {
1482 // Ensure we are not already sinking this operand.
1483 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
1484 continue;
1485
1486 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) {
1487 Ops.push_back(&Op);
1488 continue;
1489 }
1490
1491 // Check for zero-cost multiple use InsertElement/ExtractElement
1492 // instructions
1493 if (Instruction *OpInst = dyn_cast<Instruction>(Op.get())) {
1494 if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
1495 Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0));
1496 if (VecOpInst && VecOpInst->hasOneUse())
1497 continue;
1498
1499 if (getVectorInstrCost(OpInst->getOpcode(), OpInst->getType(),
1501 OpInst->getOperand(0),
1502 OpInst->getOperand(1)) == 0) {
1503 Ops.push_back(&Op);
1504 continue;
1505 }
1506 }
1507 }
1508
1509 if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
1510
1511 unsigned EltSize = DL.getTypeSizeInBits(
1512 cast<VectorType>(Shuffle->getType())->getElementType());
1513
1514 // For i32 (or greater) shufflevectors, these will be lowered into a
1515 // series of insert / extract elements, which will be coalesced away.
1516 if (EltSize < 16 || !ST->has16BitInsts())
1517 continue;
1518
1519 int NumSubElts, SubIndex;
1520 if (Shuffle->changesLength()) {
1521 if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1522 Ops.push_back(&Op);
1523 continue;
1524 }
1525
1526 if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
1527 Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
1528 !(SubIndex & 0x1)) {
1529 Ops.push_back(&Op);
1530 continue;
1531 }
1532 }
1533
1534 if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1535 Shuffle->isSingleSource()) {
1536 Ops.push_back(&Op);
1537 continue;
1538 }
1539 }
1540 }
1541
1542 return !Ops.empty();
1543}
1544
1546 const Function *Callee) const {
1547 const TargetMachine &TM = getTLI()->getTargetMachine();
1548 const GCNSubtarget *CallerST
1549 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1550 const GCNSubtarget *CalleeST
1551 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1552
1553 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1554 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1555
1556 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1557 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1558 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1559 return false;
1560
1561 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1562 // no way to support merge for backend defined attributes.
1563 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1564 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1565 if (!CallerMode.isInlineCompatible(CalleeMode))
1566 return false;
1567
1568 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1569 Callee->hasFnAttribute(Attribute::InlineHint))
1570 return true;
1571
1572 // Hack to make compile times reasonable.
1573 if (InlineMaxBB) {
1574 // Single BB does not increase total BB amount.
1575 if (Callee->size() == 1)
1576 return true;
1577 size_t BBSize = Caller->size() + Callee->size() - 1;
1578 return BBSize <= InlineMaxBB;
1579 }
1580
1581 return true;
1582}
1583
1585 const SITargetLowering *TLI,
1586 const GCNTTIImpl *TTIImpl) {
1587 const int NrOfSGPRUntilSpill = 26;
1588 const int NrOfVGPRUntilSpill = 32;
1589
1590 const DataLayout &DL = TTIImpl->getDataLayout();
1591
1592 unsigned adjustThreshold = 0;
1593 int SGPRsInUse = 0;
1594 int VGPRsInUse = 0;
1595 for (const Use &A : CB->args()) {
1596 SmallVector<EVT, 4> ValueVTs;
1597 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1598 for (auto ArgVT : ValueVTs) {
1599 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1600 CB->getContext(), CB->getCallingConv(), ArgVT);
1602 SGPRsInUse += CCRegNum;
1603 else
1604 VGPRsInUse += CCRegNum;
1605 }
1606 }
1607
1608 // The cost of passing function arguments through the stack:
1609 // 1 instruction to put a function argument on the stack in the caller.
1610 // 1 instruction to take a function argument from the stack in the callee.
1611 // 1 instruction to explicitly take care of data dependencies in the callee
1612 // function.
1613 InstructionCost ArgStackCost(1);
1614 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1615 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1617 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1618 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1620
1621 // The penalty cost is computed relative to the cost of instructions and does
1622 // not model any storage costs.
1623 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1624 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1625 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1626 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1627 return adjustThreshold;
1628}
1629
1630static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1631 const DataLayout &DL) {
1632 // If we have a pointer to a private array passed into a function
1633 // it will not be optimized out, leaving scratch usage.
1634 // This function calculates the total size in bytes of the memory that would
1635 // end in scratch if the call was not inlined.
1636 unsigned AllocaSize = 0;
1638 for (Value *PtrArg : CB->args()) {
1639 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1640 if (!Ty)
1641 continue;
1642
1643 unsigned AddrSpace = Ty->getAddressSpace();
1644 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1645 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1646 continue;
1647
1649 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1650 continue;
1651
1652 if (auto Size = AI->getAllocationSize(DL))
1653 AllocaSize += Size->getFixedValue();
1654 }
1655 return AllocaSize;
1656}
1657
1662
1664 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1665
1666 // Private object passed as arguments may end up in scratch usage if the call
1667 // is not inlined. Increase the inline threshold to promote inlining.
1668 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1669 if (AllocaSize > 0)
1670 Threshold += ArgAllocaCost;
1671 return Threshold;
1672}
1673
1675 const AllocaInst *AI) const {
1676
1677 // Below the cutoff, assume that the private memory objects would be
1678 // optimized
1679 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1680 if (AllocaSize <= ArgAllocaCutoff)
1681 return 0;
1682
1683 // Above the cutoff, we give a cost to each private memory object
1684 // depending on its size. If the array can be optimized by SROA this cost is not
1685 // added to the total-cost in the inliner cost analysis.
1686 //
1687 // We choose the total cost of the alloca such that their sum cancels the
1688 // bonus given in the threshold (ArgAllocaCost).
1689 //
1690 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1691 //
1692 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1693 // the single-bb bonus and the vector-bonus.
1694 //
1695 // We compensate the first two multipliers, by repeating logic from the
1696 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1697 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1698 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1699
1700 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1701 return BB.getTerminator()->getNumSuccessors() > 1;
1702 });
1703 if (SingleBB) {
1704 Threshold += Threshold / 2;
1705 }
1706
1707 auto ArgAllocaSize = AI->getAllocationSize(DL);
1708 if (!ArgAllocaSize)
1709 return 0;
1710
1711 // Attribute the bonus proportionally to the alloca size
1712 unsigned AllocaThresholdBonus =
1713 (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;
1714
1715 return AllocaThresholdBonus;
1716}
1717
1720 OptimizationRemarkEmitter *ORE) const {
1721 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1722}
1723
1725 TTI::PeelingPreferences &PP) const {
1726 CommonTTI.getPeelingPreferences(L, SE, PP);
1727}
1728
1729int GCNTTIImpl::getTransInstrCost(TTI::TargetCostKind CostKind) const {
1730 return getQuarterRateInstrCost(CostKind);
1731}
1732
1733int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1734 return ST->hasFullRate64Ops()
1735 ? getFullRateInstrCost()
1736 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1737 : getQuarterRateInstrCost(CostKind);
1738}
1739
1740std::pair<InstructionCost, MVT>
1741GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1742 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1743 auto Size = DL.getTypeSizeInBits(Ty);
1744 // Maximum load or store can handle 8 dwords for scalar and 4 for
1745 // vector ALU. Let's assume anything above 8 dwords is expensive
1746 // even if legal.
1747 if (Size <= 256)
1748 return Cost;
1749
1750 Cost.first += (Size + 255) / 256;
1751 return Cost;
1752}
1753
1755 return ST->hasPrefetch() ? 128 : 0;
1756}
1757
1760}
1761
1763 const Function &F,
1764 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1765 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1766 LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1767 LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1768 LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1769 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1770 ST->getFlatWorkGroupSizes(F);
1771 LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1772 LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1773 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1774 LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1775 LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1776}
1777
1780 if (!ST->hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
1781 return KnownIEEEMode::On; // Only mode on gfx1170+
1782
1783 const Function *F = I.getFunction();
1784 if (!F)
1786
1787 Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
1788 if (IEEEAttr.isValid())
1790
1791 return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
1793}
1794
1796 Align Alignment,
1797 unsigned AddressSpace,
1799 TTI::OperandValueInfo OpInfo,
1800 const Instruction *I) const {
1801 if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1802 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1804 VecTy->getElementType()->isIntegerTy(8)) {
1805 return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
1807 }
1808 }
1809 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1810 OpInfo, I);
1811}
1812
1814 if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1815 if (VecTy->getElementType()->isIntegerTy(8)) {
1816 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1817 return divideCeil(ElementCount - 1, 4);
1818 }
1819 }
1820 return BaseT::getNumberOfParts(Tp);
1821}
1822
1825 switch (Intrinsic->getIntrinsicID()) {
1826 case Intrinsic::amdgcn_wave_shuffle:
1828 default:
1829 break;
1830 }
1831 }
1832
1833 if (isAlwaysUniform(V))
1835
1836 if (isSourceOfDivergence(V))
1838
1840}
1841
1843 StackOffset BaseOffset,
1844 bool HasBaseReg, int64_t Scale,
1845 unsigned AddrSpace) const {
1846 if (HasBaseReg && Scale != 0) {
1847 // gfx1250+ can fold base+scale*index when scale matches the memory access
1848 // size (scale_offset bit). Supported for flat/global/constant/scratch
1849 // (VMEM, max 128 bits) and constant_32bit (SMRD, capped to 128 bits here).
1850 if (getST()->hasScaleOffset() && Ty && Ty->isSized() &&
1852 AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
1853 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)) {
1854 TypeSize StoreSize = getDataLayout().getTypeStoreSize(Ty);
1855 if (TypeSize::isKnownLE(StoreSize, TypeSize::getFixed(16)) &&
1856 static_cast<int64_t>(StoreSize.getFixedValue()) == Scale)
1857 return 0;
1858 }
1859 return 1;
1860 }
1861 return BaseT::getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
1862 AddrSpace);
1863}
1864
1866 const TTI::LSRCost &B) const {
1867 // Favor lower per-iteration work over preheader/setup costs.
1868 // AMDGPU lacks rich addressing modes, so ScaleCost is folded into the
1869 // effective instruction count (base+scale*index requires a separate ADD).
1870 unsigned EffInsnsA = A.Insns + A.ScaleCost;
1871 unsigned EffInsnsB = B.Insns + B.ScaleCost;
1872
1873 return std::tie(EffInsnsA, A.NumIVMuls, A.AddRecCost, A.NumBaseAdds,
1874 A.SetupCost, A.ImmCost, A.NumRegs) <
1875 std::tie(EffInsnsB, B.NumIVMuls, B.AddRecCost, B.NumBaseAdds,
1876 B.SetupCost, B.ImmCost, B.NumRegs);
1877}
1878
1880 // isLSRCostLess de-prioritizes register count; keep consistent.
1881 return false;
1882}
1883
1885 // Prefer the baseline when LSR cannot clearly reduce per-iteration work.
1886 return true;
1887}
1888
1890 const SmallBitVector &UniformArgs) const {
1892 switch (Intrinsic->getIntrinsicID()) {
1893 case Intrinsic::amdgcn_wave_shuffle:
1894 // wave_shuffle(Value, Index): result is uniform when either Value or Index
1895 // is uniform.
1896 return UniformArgs[0] || UniformArgs[1];
1897 default:
1898 llvm_unreachable("unexpected intrinsic in isUniform");
1899 }
1900}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static cl::opt< unsigned > MemcpyLoopUnroll("amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " "operations when lowering statically-sized memcpy, memmove, or" "memset as a loop"), cl::init(16), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
This file provides a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Hexagon Common GEP
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static unsigned getNumElements(Type *Ty)
This file implements the SmallBitVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
an instruction to allocate memory on the stack
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition Attributes.h:261
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to callin...
bool isInlineAsm() const
Check if this call is an inline asm statement.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
CallingConv::ID getCallingConv() const
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for an arg operand, get the arg operand number that corresponds to it.
This class represents a function call, abstracting a target machine's calling convention.
Conditional Branch instruction.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:579
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
ArrayRef< unsigned > getIndices() const
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool approxFunc() const
Definition FMF.h:70
Container class for subtarget features.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
bool isUniform(const Instruction *I, const SmallBitVector &UniformArgs) const override
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
unsigned getNumberOfRegisters(unsigned RCID) const override
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool isLSRCostLess(const TTI::LSRCost &A, const TTI::LSRCost &B) const override
bool shouldPrefetchAddressSpace(unsigned AS) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool hasBranchDivergence(const Function *F=nullptr) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
unsigned getInliningThresholdMultiplier() const override
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
unsigned adjustInliningThreshold(const CallBase *CB) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool shouldDropLSRSolutionIfLessProfitable() const override
int getInliningLastCallToStaticBonus() const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
ValueUniformity getValueUniformity(const Value *V) const override
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
bool preferSLPInstCountCheck() const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
unsigned getMinVectorRegisterBitWidth() const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
bool isNumRegsMajorCostOfLSR() const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Metadata node.
Definition Metadata.h:1080
Machine Value Type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:68
The optimization diagnostic interface.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
std::vector< AsmOperandInfo > AsmOperandInfoVector
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:308
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
Base class of all SIMD vector types.
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
LLVM_ABI int getInstrCost()
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract_or_null(Y &&MD)
Extract a Value from Metadata, allowing null.
Definition Metadata.h:683
This is an optimization pass for GlobalISel generic memory operations.
@ Length
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
InstructionCost Cost
LLVM_ABI void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ FAdd
Sum of floats.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18
@ AlwaysUniform
The result value is always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
@ Custom
The result value requires a custom uniformity check.
Definition Uniformity.h:31
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Parameters that control the generic loop unrolling transformation.
unsigned Threshold
The cost threshold for the unrolled loop.
bool UnrollVectorizedLoop
Disable runtime unrolling by default for vectorized loops.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...