LLVM 23.0.0git
GCNSubtarget.h
Go to the documentation of this file.
1//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// AMD GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17#include "AMDGPUCallLowering.h"
19#include "AMDGPUSubtarget.h"
20#include "SIFrameLowering.h"
21#include "SIISelLowering.h"
22#include "SIInstrInfo.h"
25
26#define GET_SUBTARGETINFO_HEADER
27#include "AMDGPUGenSubtargetInfo.inc"
28
29namespace llvm {
30
31class GCNTargetMachine;
32
34 public AMDGPUSubtarget {
35public:
37
38 // Following 2 enums are documented at:
39 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
40 enum class TrapHandlerAbi {
41 NONE = 0x00,
42 AMDHSA = 0x01,
43 };
44
45 enum class TrapID {
48 };
49
50private:
51 /// SelectionDAGISel related APIs.
52 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
53
54 /// GlobalISel related APIs.
55 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
56 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
57 std::unique_ptr<InstructionSelector> InstSelector;
58 std::unique_ptr<LegalizerInfo> Legalizer;
59 std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
60
61protected:
62 // Basic subtarget description.
64 unsigned Gen = INVALID;
66 int LDSBankCount = 0;
68
69 // Instruction cache line size in bytes; set from TableGen subtarget features.
70 unsigned InstCacheLineSize = 0;
71
72 // Dynamically set bits that enable features.
73 bool DynamicVGPR = false;
75 bool ScalarizeGlobal = false;
76
77 /// The maximum number of instructions that may be placed within an S_CLAUSE,
78 /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
79 /// indicates a lack of S_CLAUSE support.
80 unsigned MaxHardClauseLength = 0;
81
82#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
83 bool ATTRIBUTE = DEFAULT;
84#include "AMDGPUGenSubtargetInfo.inc"
85
86private:
87 SIInstrInfo InstrInfo;
88 SITargetLowering TLInfo;
89 SIFrameLowering FrameLowering;
90
91 /// Get the register that represents the actual dependency between the
92 /// definition and the use. The definition might only affect a subregister
93 /// that is not actually used. Works for both virtual and physical registers.
94 /// Note: Currently supports VOP3P instructions (without WMMA an SWMMAC).
95 /// Returns the definition register if there is a real dependency and no
96 /// better match is found.
97 Register getRealSchedDependency(const MachineInstr &DefI, int DefOpIdx,
98 const MachineInstr &UseI, int UseOpIdx) const;
99
100public:
101 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
102 const GCNTargetMachine &TM);
103 ~GCNSubtarget() override;
104
106 StringRef FS);
107
108 /// Diagnose inconsistent subtarget features before attempting to codegen
109 /// function \p F.
110 void checkSubtargetFeatures(const Function &F) const;
111
112 const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; }
113
114 const SIFrameLowering *getFrameLowering() const override {
115 return &FrameLowering;
116 }
117
118 const SITargetLowering *getTargetLowering() const override { return &TLInfo; }
119
120 const SIRegisterInfo *getRegisterInfo() const override {
121 return &InstrInfo.getRegisterInfo();
122 }
123
124 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
125
126 const CallLowering *getCallLowering() const override {
127 return CallLoweringInfo.get();
128 }
129
130 const InlineAsmLowering *getInlineAsmLowering() const override {
131 return InlineAsmLoweringInfo.get();
132 }
133
135 return InstSelector.get();
136 }
137
138 const LegalizerInfo *getLegalizerInfo() const override {
139 return Legalizer.get();
140 }
141
142 const AMDGPURegisterBankInfo *getRegBankInfo() const override {
143 return RegBankInfo.get();
144 }
145
147 return TargetID;
148 }
149
151 return &InstrItins;
152 }
153
155
157
158 bool isGFX11Plus() const { return getGeneration() >= GFX11; }
159
160#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
161 bool GETTER() const override { return ATTRIBUTE; }
162#include "AMDGPUGenSubtargetInfo.inc"
163
164 unsigned getMaxWaveScratchSize() const {
165 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
166 if (getGeneration() >= GFX12) {
167 // 18-bit field in units of 64-dword.
168 return (64 * 4) * ((1 << 18) - 1);
169 }
170 if (getGeneration() == GFX11) {
171 // 15-bit field in units of 64-dword.
172 return (64 * 4) * ((1 << 15) - 1);
173 }
174 // 13-bit field in units of 256-dword.
175 return (256 * 4) * ((1 << 13) - 1);
176 }
177
178 /// Return the number of high bits known to be zero for a frame index.
182
183 int getLDSBankCount() const { return LDSBankCount; }
184
185 /// Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
186 unsigned getInstCacheLineSize() const { return InstCacheLineSize; }
187
188 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
189 return (ForBufferRSrc || !hasFlatScratchEnabled()) ? MaxPrivateElementSize
190 : 16;
191 }
192
193 unsigned getConstantBusLimit(unsigned Opcode) const;
194
195 /// Returns if the result of this instruction with a 16-bit result returned in
196 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
197 /// the original value.
198 bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
199
200 bool supportsWGP() const {
201 if (HasGFX1250Insts)
202 return false;
203 return getGeneration() >= GFX10;
204 }
205
206 bool hasHWFP64() const { return HasFP64; }
207
208 bool hasAddr64() const {
210 }
211
212 bool hasFlat() const {
214 }
215
216 // Return true if the target only has the reverse operand versions of VALU
217 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
218 bool hasOnlyRevVALUShifts() const {
220 }
221
222 bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; }
223
224 bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; }
225
226 bool hasMin3Max3_16() const {
228 }
229
230 bool hasSwap() const { return HasGFX9Insts; }
231
232 bool hasScalarPackInsts() const { return HasGFX9Insts; }
233
234 bool hasScalarMulHiInsts() const { return HasGFX9Insts; }
235
236 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
237
238 bool hasAsyncMark() const { return hasVMemToLDSLoad() || HasAsynccnt; }
239
243
245 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
246 return getGeneration() >= GFX9;
247 }
248
249 /// True if the offset field of DS instructions works as expected. On SI, the
250 /// offset uses a 16-bit adder and does not always wrap properly.
251 bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; }
252
254 return EnableUnsafeDSOffsetFolding;
255 }
256
257 /// Condition output from div_scale is usable.
261
262 /// Extra wait hazard is needed in some cases before
263 /// s_cbranch_vccnz/s_cbranch_vccz.
264 bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; }
265
266 /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
267 bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; }
268
269 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
270 /// was written by a VALU instruction.
273 }
274
275 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
276 /// SGPR was written by a VALU Instruction.
279 }
280
281 bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; }
282
283 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
284 unsigned getSetRegWaitStates() const {
285 return getGeneration() <= SEA_ISLANDS ? 1 : 2;
286 }
287
288 /// Return the amount of LDS that can be used that will not restrict the
289 /// occupancy lower than WaveCount.
290 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
291 const Function &) const;
292
295 }
296
297 /// \returns If target supports S_DENORM_MODE.
298 bool hasDenormModeInst() const {
300 }
301
302 /// \returns If target supports ds_read/write_b128 and user enables generation
303 /// of ds_read/write_b128.
304 bool useDS128() const { return HasCIInsts && EnableDS128; }
305
306 /// \return If target supports ds_read/write_b96/128.
307 bool hasDS96AndDS128() const { return HasCIInsts; }
308
309 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
310 bool haveRoundOpsF64() const { return HasCIInsts; }
311
312 /// \returns If MUBUF instructions always perform range checking, even for
313 /// buffer resources used for private memory access.
317
318 /// \returns If target requires PRT Struct NULL support (zero result registers
319 /// for sparse texture support).
320 bool usePRTStrictNull() const { return EnablePRTStrictNull; }
321
323 return HasUnalignedBufferAccess && HasUnalignedAccessMode;
324 }
325
327 return HasUnalignedDSAccess && HasUnalignedAccessMode;
328 }
329
331 return HasUnalignedScratchAccess && HasUnalignedAccessMode;
332 }
333
334 bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
335
336 bool isTgSplitEnabled() const { return EnableTgSplit; }
337
338 bool isCuModeEnabled() const { return EnableCuMode; }
339
340 bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
341
342 bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }
343
344 // Check if target supports ST addressing mode with FLAT scratch instructions.
345 // The ST addressing mode means no registers are used, either VGPR or SGPR,
346 // but only immediate offset is swizzled and added to the FLAT scratch base.
347 bool hasFlatScratchSTMode() const {
348 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
349 }
350
351 bool hasFlatScratchSVSMode() const { return HasGFX940Insts || HasGFX11Insts; }
352
354 return hasArchitectedFlatScratch() ||
355 (EnableFlatScratch && hasFlatScratchInsts());
356 }
357
358 bool hasGlobalAddTidInsts() const { return HasGFX10_BEncoding; }
359
360 bool hasAtomicCSub() const { return HasGFX10_BEncoding; }
361
362 bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }
363
364 bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }
365
366 bool hasExportInsts() const {
367 return !hasGFX940Insts() && !hasGFX1250Insts();
368 }
369
370 bool hasVINTERPEncoding() const {
371 return HasGFX11Insts && !hasGFX1250Insts();
372 }
373
374 // DS_ADD_F64/DS_ADD_RTN_F64
375 bool hasLdsAtomicAddF64() const {
376 return hasGFX90AInsts() || hasGFX1250Insts();
377 }
378
380 return getGeneration() >= GFX9;
381 }
382
383 bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }
384
385 bool hasD16LoadStore() const { return getGeneration() >= GFX9; }
386
388 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
389 }
390
391 bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }
392
393 /// Return if most LDS instructions have an m0 use that require m0 to be
394 /// initialized.
395 bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }
396
397 // True if the hardware rewinds and replays GWS operations if a wave is
398 // preempted.
399 //
400 // If this is false, a GWS operation requires testing if a nack set the
401 // MEM_VIOL bit, and repeating if so.
402 bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }
403
404 /// \returns if target has ds_gws_sema_release_all instruction.
405 bool hasGWSSemaReleaseAll() const { return HasCIInsts; }
406
407 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
408
409 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
410
411 // Covers VS/PS/CS graphics shaders
412 bool isMesaGfxShader(const Function &F) const {
413 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
414 }
415
416 bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
417
418 bool hasAtomicFaddInsts() const {
419 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
420 }
421
423 return getGeneration() < SEA_ISLANDS;
424 }
425
426 bool hasInstPrefetch() const {
427 return getGeneration() == GFX10 || getGeneration() == GFX11;
428 }
429
430 bool hasPrefetch() const { return HasGFX12Insts; }
431
432 // Has s_cmpk_* instructions.
433 bool hasSCmpK() const { return getGeneration() < GFX12; }
434
435 // Scratch is allocated in 256 dword per wave blocks for the entire
436 // wavefront. When viewed from the perspective of an arbitrary workitem, this
437 // is 4-byte aligned.
438 //
439 // Only 4-byte alignment is really needed to access anything. Transformations
440 // on the pointer value itself may rely on the alignment / known low bits of
441 // the pointer. Set this to something above the minimum to avoid needing
442 // dynamic realignment in common cases.
443 Align getStackAlignment() const { return Align(16); }
444
445 bool enableMachineScheduler() const override { return true; }
446
447 bool useAA() const override;
448
449 bool enableSubRegLiveness() const override { return true; }
450
453
454 // static wrappers
455 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
456
457 // XXX - Why is this here if it isn't in the default pass set?
458 bool enableEarlyIfConversion() const override { return true; }
459
461 const SchedRegion &Region) const override;
462
464 const SchedRegion &Region) const override;
465
466 void mirFileLoaded(MachineFunction &MF) const override;
467
468 unsigned getMaxNumUserSGPRs() const {
469 return AMDGPU::getMaxNumUserSGPRs(*this);
470 }
471
472 bool useVGPRIndexMode() const;
473
474 bool hasScalarCompareEq64() const {
476 }
477
478 bool hasLDSFPAtomicAddF32() const { return HasGFX8Insts; }
479 bool hasLDSFPAtomicAddF64() const {
480 return HasGFX90AInsts || HasGFX1250Insts;
481 }
482
483 /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
484 bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
485
486 /// \returns true if the subtarget has the v_permlane64_b32 instruction.
487 bool hasPermLane64() const { return getGeneration() >= GFX11; }
488
489 bool hasDPPRowShare() const {
490 return HasDPP && (HasGFX90AInsts || getGeneration() >= GFX10);
491 }
492
493 // Has V_PK_MOV_B32 opcode
494 bool hasPkMovB32() const { return HasGFX90AInsts; }
495
497 return getGeneration() >= GFX10 || hasGFX940Insts();
498 }
499
500 bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }
501
502 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
503
504 unsigned getNSAMaxSize(bool HasSampler = false) const {
505 return AMDGPU::getNSAMaxSize(*this, HasSampler);
506 }
507
508 bool hasMadF16() const;
509
510 bool hasMovB64() const { return HasGFX940Insts || HasGFX1250Insts; }
511
512 // Scalar and global loads support scale_offset bit.
513 bool hasScaleOffset() const { return HasGFX1250Insts; }
514
515 // FLAT GLOBAL VOffset is signed
516 bool hasSignedGVSOffset() const { return HasGFX1250Insts; }
517
519
521 return HasUserSGPRInit16Bug && isWave32();
522 }
523
527
528 // \returns true if the subtarget supports DWORDX3 load/store instructions.
529 bool hasDwordx3LoadStores() const { return HasCIInsts; }
530
534
539
542 }
543
546 }
547
549 return HasLDSMisalignedBug && !EnableCuMode;
550 }
551
552 // Shift amount of a 64 bit shift cannot be a highest allocated register
553 // if also at the end of the allocation block.
554 bool hasShift64HighRegBug() const { return HasGFX90AInsts; }
555
556 // Has one cycle hazard on transcendental instruction feeding a
557 // non transcendental VALU.
558 bool hasTransForwardingHazard() const { return HasGFX940Insts; }
559
560 // Has one cycle hazard on a VALU instruction partially writing dst with
561 // a shift of result bits feeding another VALU instruction.
562 bool hasDstSelForwardingHazard() const { return HasGFX940Insts; }
563
564 // Cannot use op_sel with v_dot instructions.
565 bool hasDOTOpSelHazard() const { return HasGFX940Insts || HasGFX11Insts; }
566
567 // Does not have HW interlocs for VALU writing and then reading SGPRs.
568 bool hasVDecCoExecHazard() const { return HasGFX940Insts; }
569
570 bool hasHardClauses() const { return MaxHardClauseLength > 0; }
571
573 return getGeneration() == GFX10;
574 }
575
576 bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
577
578 bool hasLdsDirect() const { return getGeneration() >= GFX11; }
579
580 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
581
583 return getGeneration() == GFX11;
584 }
585
586 bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; }
587
588 // All GFX9 targets experience a fetch delay when an instruction at the start
589 // of a loop header is split by a 32-byte fetch window boundary, but GFX950
590 // is uniquely sensitive to this: the delay triggers further performance
591 // degradation beyond the fetch latency itself.
592 bool hasLoopHeadInstSplitSensitivity() const { return HasGFX950Insts; }
593
594 bool requiresCodeObjectV6() const { return RequiresCOV6; }
595
596 bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
597
598 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
599
601 return HasGFX12Insts && !HasGFX1250Insts;
602 }
603
604 bool setRegModeNeedsVNOPs() const {
605 return HasGFX1250Insts && getGeneration() == GFX12;
606 }
607
608 /// Return if operations acting on VGPR tuples require even alignment.
609 bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
610
611 /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
612 bool hasSPackHL() const { return HasGFX11Insts; }
613
614 /// Return true if the target's EXP instruction has the COMPR flag, which
615 /// affects the meaning of the EN (enable) bits.
616 bool hasCompressedExport() const { return !HasGFX11Insts; }
617
618 /// Return true if the target's EXP instruction supports the NULL export
619 /// target.
620 bool hasNullExportTarget() const { return !HasGFX11Insts; }
621
622 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
623
624 /// Return true if the target has the S_DELAY_ALU instruction.
625 bool hasDelayAlu() const { return HasGFX11Insts; }
626
627 /// Returns true if the target supports
628 /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
629 /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
630 bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }
631
632 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
633 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
634 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
635
636 /// \returns true if the target has packed f32 instructions that only read 32
637 /// bits from a scalar operand (SGPR or literal) and replicates the bits to
638 /// both channels.
640 return getGeneration() == GFX12 && HasGFX1250Insts;
641 }
642
643 bool hasAddPC64Inst() const { return HasGFX1250Insts; }
644
645 /// \returns true if the target supports expert scheduling mode 2 which relies
646 /// on the compiler to insert waits to avoid hazards between VMEM and VALU
647 /// instructions in some instances.
648 bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; }
649
650 /// \returns The maximum number of instructions that can be enclosed in an
651 /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
652 /// instruction.
653 unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
654
655 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
656 /// SGPRs
657 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
658
659 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
660 /// VGPRs
661 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
662 unsigned DynamicVGPRBlockSize) const;
663
664 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
665 /// be achieved when the only function running on a CU is \p F, each workgroup
666 /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
667 /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
668 /// range, so this returns a range as well.
669 ///
670 /// Note that occupancy can be affected by the scratch allocation as well, but
671 /// we do not have enough information to compute it.
672 std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
673 unsigned LDSSize = 0,
674 unsigned NumSGPRs = 0,
675 unsigned NumVGPRs = 0) const;
676
677 /// \returns true if the flat_scratch register should be initialized with the
678 /// pointer to the wave's scratch memory rather than a size and offset.
679 bool flatScratchIsPointer() const {
681 }
682
683 /// \returns true if the machine has merged shaders in which s0-s7 are
684 /// reserved by the hardware and user SGPRs start at s8
685 bool hasMergedShaders() const { return getGeneration() >= GFX9; }
686
687 // \returns true if the target supports the pre-NGG legacy geometry path.
688 bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
689
690 // \returns true if the target has split barriers feature
691 bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
692
693 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
694 bool hasRrWGMode() const { return getGeneration() >= GFX12; }
695
696 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
697 /// values.
698 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
699
700 bool hasINVWBL2WaitCntRequirement() const { return HasGFX1250Insts; }
701
702 bool hasVOPD3() const { return HasGFX1250Insts; }
703
704 // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
705 bool hasVectorMulU64() const { return HasGFX1250Insts; }
706
707 // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
708 // instructions.
709 bool hasMadU64U32NoCarry() const { return HasGFX1250Insts; }
710
711 // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
712 bool hasIntMinMax64() const { return HasGFX1250Insts; }
713
714 // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
715 bool hasPkMinMax3Insts() const { return HasGFX1250Insts; }
716
717 // \returns true if target has S_GET_SHADER_CYCLES_U64 instruction.
718 bool hasSGetShaderCyclesInst() const { return HasGFX1250Insts; }
719
720 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
721 // of sign-extending. Note that GFX1250 has not only fixed the bug but also
722 // extended VA to 57 bits.
724 return HasGFX12Insts && !HasGFX1250Insts;
725 }
726
727 // \returns true if the target needs to create a prolog for backward
728 // compatibility when preloading kernel arguments.
730 return hasKernargPreload() && !HasGFX1250Insts;
731 }
732
733 bool hasCondSubInsts() const { return HasGFX12Insts; }
734
735 bool hasSubClampInsts() const { return hasGFX10_3Insts(); }
736
737 /// \returns SGPR allocation granularity supported by the subtarget.
738 unsigned getSGPRAllocGranule() const {
740 }
741
742 /// \returns SGPR encoding granularity supported by the subtarget.
743 unsigned getSGPREncodingGranule() const {
745 }
746
747 /// \returns Total number of SGPRs supported by the subtarget.
748 unsigned getTotalNumSGPRs() const {
750 }
751
752 /// \returns Addressable number of SGPRs supported by the subtarget.
753 unsigned getAddressableNumSGPRs() const {
755 }
756
757 /// \returns Minimum number of SGPRs that meets the given number of waves per
758 /// execution unit requirement supported by the subtarget.
759 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
760 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
761 }
762
763 /// \returns Maximum number of SGPRs that meets the given number of waves per
764 /// execution unit requirement supported by the subtarget.
765 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
766 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
767 }
768
769 /// \returns Reserved number of SGPRs. This is common
770 /// utility function called by MachineFunction and
771 /// Function variants of getReservedNumSGPRs.
772 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
773 /// \returns Reserved number of SGPRs for given machine function \p MF.
774 unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
775
776 /// \returns Reserved number of SGPRs for given function \p F.
777 unsigned getReservedNumSGPRs(const Function &F) const;
778
779 /// \returns Maximum number of preloaded SGPRs for the subtarget.
780 unsigned getMaxNumPreloadedSGPRs() const;
781
782 /// \returns max num SGPRs. This is the common utility
783 /// function called by MachineFunction and Function
784 /// variants of getMaxNumSGPRs.
785 unsigned getBaseMaxNumSGPRs(const Function &F,
786 std::pair<unsigned, unsigned> WavesPerEU,
787 unsigned PreloadedSGPRs,
788 unsigned ReservedNumSGPRs) const;
789
790 /// \returns Maximum number of SGPRs that meets number of waves per execution
791 /// unit requirement for function \p MF, or number of SGPRs explicitly
792 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
793 ///
794 /// \returns Value that meets number of waves per execution unit requirement
795 /// if explicitly requested value cannot be converted to integer, violates
796 /// subtarget's specifications, or does not meet number of waves per execution
797 /// unit requirement.
798 unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
799
800 /// \returns Maximum number of SGPRs that meets number of waves per execution
801 /// unit requirement for function \p F, or number of SGPRs explicitly
802 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
803 ///
804 /// \returns Value that meets number of waves per execution unit requirement
805 /// if explicitly requested value cannot be converted to integer, violates
806 /// subtarget's specifications, or does not meet number of waves per execution
807 /// unit requirement.
808 unsigned getMaxNumSGPRs(const Function &F) const;
809
810 /// \returns VGPR allocation granularity supported by the subtarget.
811 unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
812 return AMDGPU::IsaInfo::getVGPRAllocGranule(this, DynamicVGPRBlockSize);
813 }
814
815 /// \returns VGPR encoding granularity supported by the subtarget.
816 unsigned getVGPREncodingGranule() const {
818 }
819
820 /// \returns Total number of VGPRs supported by the subtarget.
821 unsigned getTotalNumVGPRs() const {
823 }
824
825 /// \returns Addressable number of architectural VGPRs supported by the
826 /// subtarget.
830
831 /// \returns Addressable number of VGPRs supported by the subtarget.
832 unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
833 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize);
834 }
835
836 /// \returns the minimum number of VGPRs that will prevent achieving more than
837 /// the specified number of waves \p WavesPerEU.
838 unsigned getMinNumVGPRs(unsigned WavesPerEU,
839 unsigned DynamicVGPRBlockSize) const {
840 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU,
841 DynamicVGPRBlockSize);
842 }
843
844 /// \returns the maximum number of VGPRs that can be used and still achieved
845 /// at least the specified number of waves \p WavesPerEU.
846 unsigned getMaxNumVGPRs(unsigned WavesPerEU,
847 unsigned DynamicVGPRBlockSize) const {
848 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU,
849 DynamicVGPRBlockSize);
850 }
851
852 /// \returns max num VGPRs. This is the common utility function
853 /// called by MachineFunction and Function variants of getMaxNumVGPRs.
854 unsigned
856 std::pair<unsigned, unsigned> NumVGPRBounds) const;
857
858 /// \returns Maximum number of VGPRs that meets number of waves per execution
859 /// unit requirement for function \p F, or number of VGPRs explicitly
860 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
861 ///
862 /// \returns Value that meets number of waves per execution unit requirement
863 /// if explicitly requested value cannot be converted to integer, violates
864 /// subtarget's specifications, or does not meet number of waves per execution
865 /// unit requirement.
866 unsigned getMaxNumVGPRs(const Function &F) const;
867
868 unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }
869
870 /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
871 /// of waves per execution unit required for the function \p MF.
872 std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
873
874 /// \returns Maximum number of VGPRs that meets number of waves per execution
875 /// unit requirement for function \p MF, or number of VGPRs explicitly
876 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
877 ///
878 /// \returns Value that meets number of waves per execution unit requirement
879 /// if explicitly requested value cannot be converted to integer, violates
880 /// subtarget's specifications, or does not meet number of waves per execution
881 /// unit requirement.
882 unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
883
884 bool supportsWave32() const { return getGeneration() >= GFX10; }
885
886 bool supportsWave64() const { return !hasGFX1250Insts(); }
887
888 bool isWave32() const { return getWavefrontSize() == 32; }
889
890 bool isWave64() const { return getWavefrontSize() == 64; }
891
892 /// Returns if the wavesize of this subtarget is known reliable. This is false
893 /// only for the a default target-cpu that does not have an explicit
894 /// +wavefrontsize target feature.
895 bool isWaveSizeKnown() const {
896 return hasFeature(AMDGPU::FeatureWavefrontSize32) ||
897 hasFeature(AMDGPU::FeatureWavefrontSize64);
898 }
899
901 return getRegisterInfo()->getBoolRC();
902 }
903
904 /// \returns Maximum number of work groups per compute unit supported by the
905 /// subtarget and limited by given \p FlatWorkGroupSize.
906 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
907 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
908 }
909
910 /// \returns Minimum flat work group size supported by the subtarget.
911 unsigned getMinFlatWorkGroupSize() const override {
913 }
914
915 /// \returns Maximum flat work group size supported by the subtarget.
916 unsigned getMaxFlatWorkGroupSize() const override {
918 }
919
920 /// \returns Number of waves per execution unit required to support the given
921 /// \p FlatWorkGroupSize.
922 unsigned
923 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
924 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
925 }
926
927 /// \returns Minimum number of waves per execution unit supported by the
928 /// subtarget.
929 unsigned getMinWavesPerEU() const override {
931 }
932
933 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
934 SDep &Dep,
935 const TargetSchedModel *SchedModel) const override;
936
937 // \returns true if it's beneficial on this subtarget for the scheduler to
938 // cluster stores as well as loads.
939 bool shouldClusterStores() const { return getGeneration() >= GFX11; }
940
941 // \returns the number of address arguments from which to enable MIMG NSA
942 // on supported architectures.
943 unsigned getNSAThreshold(const MachineFunction &MF) const;
944
945 // \returns true if the subtarget has a hazard requiring an "s_nop 0"
946 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
947 bool requiresNopBeforeDeallocVGPRs() const { return !HasGFX1250Insts; }
948
949 // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
950 // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
951 bool requiresWaitIdleBeforeGetReg() const { return HasGFX1250Insts; }
952
953 bool isDynamicVGPREnabled() const { return DynamicVGPR; }
954 unsigned getDynamicVGPRBlockSize() const {
955 return DynamicVGPRBlockSize32 ? 32 : 16;
956 }
957
959 // AMDGPU doesn't care if early-clobber and undef operands are allocated
960 // to the same register.
961 return false;
962 }
963
964 // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
965 // and surronded by S_WAIT_ALU(0xFFE3).
967 return getGeneration() == GFX12;
968 }
969
970 // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
971 // read.
973 return HasGFX1250Insts && getGeneration() == GFX12;
974 }
975
976 // src_flat_scratch_hi cannot be used as a source in SALU producing a 64-bit
977 // result.
979 return HasGFX1250Insts && getGeneration() == GFX12;
980 }
981
982 /// \returns true if the subtarget requires a wait for xcnt before VMEM
983 /// accesses that must never be repeated in the event of a page fault/re-try.
984 /// Atomic stores/rmw and all volatile accesses fall under this criteria.
986 return HasGFX1250Insts;
987 }
988
  /// \returns the number of significant bits in the immediate field of the
  /// S_NOP instruction.
  unsigned getSNopBits() const {
    // NOTE(review): the guard conditions that selected between these three
    // returns (original lines 992 and 994) were lost in extraction; as
    // written, the first return is unconditional. Restore the generation
    // checks from upstream before relying on this function.
    return 7;
    return 4;
    return 3;
  }
998
  // NOTE(review): the body of this function (original line 1000) was lost in
  // extraction — as written it falls off the end without returning, which is
  // UB. Restore the generation check from upstream.
  bool supportsBPermute() const {
  }
1002
  // NOTE(review): the signature (per the member index:
  // `bool supportsWaveWideBPermute() const`, original line 1003) and the
  // middle operand of this expression (original line 1005) were lost in
  // extraction; the fragment below is not compilable as-is. Restore from
  // upstream.
    return (getGeneration() <= AMDGPUSubtarget::GFX9 ||
    isWave32();
  }
1008
1009 /// Return true if real (non-fake) variants of True16 instructions using
1010 /// 16-bit registers should be code-generated. Fake True16 instructions are
1011 /// identical to non-fake ones except that they take 32-bit registers as
1012 /// operands and always use their low halves.
1013 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
1014 // supported and the support for fake True16 instructions is removed.
1015 bool useRealTrue16Insts() const {
1016 return hasTrue16BitInsts() && EnableRealTrue16Insts;
1017 }
1018
1020 return getGeneration() >= GFX10 || isTgSplitEnabled();
1021 }
1022};
1023
1025public:
  /// \returns true if the implicit buffer pointer user SGPR input is in use.
  bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }

  /// \returns true if the private segment buffer user SGPR input is in use.
  bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }

  /// \returns true if the dispatch pointer user SGPR input is in use.
  bool hasDispatchPtr() const { return DispatchPtr; }

  /// \returns true if the queue pointer user SGPR input is in use.
  bool hasQueuePtr() const { return QueuePtr; }

  /// \returns true if the kernarg segment pointer user SGPR input is in use.
  bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }

  /// \returns true if the dispatch ID user SGPR input is in use.
  bool hasDispatchID() const { return DispatchID; }

  /// \returns true if the flat scratch init user SGPR input is in use.
  bool hasFlatScratchInit() const { return FlatScratchInit; }

  /// \returns true if the private segment size user SGPR input is in use.
  bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }

  /// \returns the number of SGPRs allocated for kernarg preloading.
  unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }

  /// \returns the number of user SGPRs currently in use.
  unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }

  /// \returns the number of user SGPRs still available. Out-of-line.
  unsigned getNumFreeUserSGPRs();

  /// Allocates \p NumSGPRs for kernarg preloading. Out-of-line.
  void allocKernargPreloadSGPRs(unsigned NumSGPRs);

  // NOTE(review): the UserSGPRID enum declared here (original lines
  // 1050-1060) was lost in extraction; getNumUserSGPRForField switches over
  // its enumerators. Restore from upstream.
1061 // Returns the size in number of SGPRs for preload user SGPR field.
1063 switch (ID) {
1065 return 2;
1067 return 4;
1068 case DispatchPtrID:
1069 return 2;
1070 case QueuePtrID:
1071 return 2;
1073 return 2;
1074 case DispatchIdID:
1075 return 2;
1076 case FlatScratchInitID:
1077 return 2;
1079 return 1;
1080 }
1081 llvm_unreachable("Unknown UserSGPRID.");
1082 }
1083
  /// Analyzes user SGPR usage for function \p F on subtarget \p ST.
  /// Out-of-line.
  GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1085
private:
  // Subtarget this usage info was computed against.
  const GCNSubtarget &ST;

  // Private memory buffer
  // Compute directly in sgpr[0:1]
  // Other shaders indirect 64-bits at sgpr[0:1]
  bool ImplicitBufferPtr = false;

  // Whether the private segment buffer user SGPR input is used.
  bool PrivateSegmentBuffer = false;

  // Whether the dispatch pointer user SGPR input is used.
  bool DispatchPtr = false;

  // Whether the queue pointer user SGPR input is used.
  bool QueuePtr = false;

  // Whether the kernarg segment pointer user SGPR input is used.
  bool KernargSegmentPtr = false;

  // Whether the dispatch ID user SGPR input is used.
  bool DispatchID = false;

  // Whether the flat scratch init user SGPR input is used.
  bool FlatScratchInit = false;

  // Whether the private segment size user SGPR input is used.
  bool PrivateSegmentSize = false;

  // Number of SGPRs allocated for kernarg preloading.
  unsigned NumKernargPreloadSGPRs = 0;

  // Running count of user SGPRs in use.
  unsigned NumUsedUserSGPRs = 0;
1111};
1112
1113} // end namespace llvm
1114
1115#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
static cl::opt< bool > EnableLoadStoreOpt("aarch64-enable-ldst-opt", cl::desc("Enable the load/store pair" " optimization pass"), cl::init(true), cl::Hidden)
This file describes how to lower LLVM calls to machine code calls.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
Base class for AMDGPU specific classes of TargetSubtarget.
#define F(x, y, z)
Definition MD5.cpp:54
Promote Memory to Register
Definition Mem2Reg.cpp:110
SI DAG Lowering interface definition.
Interface definition for SIInstrInfo.
unsigned getWavefrontSizeLog2() const
AMDGPUSubtarget(const Triple &TT)
unsigned getMaxWavesPerEU() const
unsigned getWavefrontSize() const
bool hasPrefetch() const
bool hasFlat() const
bool hasD16Images() const
InstrItineraryData InstrItins
bool useVGPRIndexMode() const
bool partialVCCWritesUpdateVCCZ() const
Writes to VCC_LO/VCC_HI update the VCCZ flag.
bool hasSwap() const
bool hasPkMinMax3Insts() const
bool hasD16LoadStore() const
bool hasMergedShaders() const
bool hasRrWGMode() const
bool hasScalarCompareEq64() const
int getLDSBankCount() const
bool hasOnlyRevVALUShifts() const
bool hasNonNSAEncoding() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasExpertSchedulingMode() const
void mirFileLoaded(MachineFunction &MF) const override
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool loadStoreOptEnabled() const
bool enableSubRegLiveness() const override
unsigned getSGPRAllocGranule() const
bool hasLdsAtomicAddF64() const
bool hasFlatLgkmVMemCountInOrder() const
bool flatScratchIsPointer() const
bool requiresWaitOnWorkgroupReleaseFence() const
bool hasShift64HighRegBug() const
unsigned MaxPrivateElementSize
bool unsafeDSOffsetFoldingEnabled() const
bool hasFPAtomicToDenormModeHazard() const
unsigned getAddressableNumArchVGPRs() const
bool vmemWriteNeedsExpWaitcnt() const
bool shouldClusterStores() const
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasUserSGPRInit16BugInWave32() const
unsigned getSGPREncodingGranule() const
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS)
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasFlatScratchHiInB64InstHazard() const
bool hasDstSelForwardingHazard() const
void setScalarizeGlobalBehavior(bool b)
bool hasFlatScratchEnabled() const
unsigned getSNopBits() const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool hasMultiDwordFlatScratchAddressing() const
bool hasFmaakFmamkF64Insts() const
bool hasHWFP64() const
bool hasScaleOffset() const
bool hasDenormModeInst() const
bool hasCvtScaleForwardingHazard() const
unsigned getTotalNumVGPRs() const
unsigned getMinWavesPerEU() const override
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasVALUMaskWriteHazard() const
bool hasCondSubInsts() const
const InlineAsmLowering * getInlineAsmLowering() const override
unsigned getTotalNumSGPRs() const
const InstrItineraryData * getInstrItineraryData() const override
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const override
void overridePostRASchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
Align getStackAlignment() const
bool privateMemoryResourceIsRangeChecked() const
bool hasScalarSubwordLoads() const
bool hasMadF16() const
bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const
unsigned getInstCacheLineSize() const
Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
bool hasLoopHeadInstSplitSensitivity() const
bool hasDwordx3LoadStores() const
bool hasSignedScratchOffsets() const
bool hasGlobalAddTidInsts() const
bool hasFlatScrRegister() const
bool hasGetPCZeroExtension() const
bool hasPermLane64() const
bool requiresNopBeforeDeallocVGPRs() const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool supportsGetDoorbellID() const
bool supportsWave32() const
bool isTgSplitEnabled() const
unsigned getMaxNumAGPRs(const Function &F) const
bool hasReadM0MovRelInterpHazard() const
bool isDynamicVGPREnabled() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasDOTOpSelHazard() const
bool hasLdsWaitVMSRC() const
const TargetRegisterClass * getBoolRC() const
unsigned getBaseMaxNumVGPRs(const Function &F, std::pair< unsigned, unsigned > NumVGPRBounds) const
bool hasFmaakFmamkF32Insts() const
bool hasMad64_32() const
InstructionSelector * getInstructionSelector() const override
unsigned getVGPREncodingGranule() const
bool hasHardClauses() const
bool useDS128() const
bool hasExtendedWaitCounts() const
bool d16PreservesUnusedBits() const
bool hasInstPrefetch() const
bool hasAddPC64Inst() const
unsigned maxHardClauseLength() const
bool isMesaGfxShader(const Function &F) const
bool hasExportInsts() const
bool hasVINTERPEncoding() const
const AMDGPURegisterBankInfo * getRegBankInfo() const override
bool hasLegacyGeometry() const
TrapHandlerAbi getTrapHandlerAbi() const
bool isCuModeEnabled() const
const SIFrameLowering * getFrameLowering() const override
bool hasDPPRowShare() const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
unsigned getBaseMaxNumSGPRs(const Function &F, std::pair< unsigned, unsigned > WavesPerEU, unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
unsigned getMaxNumPreloadedSGPRs() const
GCNSubtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
bool has12DWordStoreHazard() const
bool hasVALUPartialForwardingHazard() const
void overrideSchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
bool useVGPRBlockOpsForCSR() const
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
bool needsKernArgPreloadProlog() const
bool hasMin3Max3_16() const
unsigned getMaxNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const
bool hasMadU64U32NoCarry() const
unsigned getSetRegWaitStates() const
Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
const SITargetLowering * getTargetLowering() const override
bool hasTransForwardingHazard() const
bool enableMachineScheduler() const override
bool hasLDSFPAtomicAddF64() const
unsigned getNSAThreshold(const MachineFunction &MF) const
bool getScalarizeGlobalBehavior() const
bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const
bool hasReadM0LdsDmaHazard() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasScratchBaseForwardingHazard() const
bool hasIntMinMax64() const
bool hasScalarPackInsts() const
bool requiresDisjointEarlyClobberAndUndef() const override
bool hasVALUReadSGPRHazard() const
bool usePRTStrictNull() const
bool hasMovB64() const
unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const
bool supportsWaveWideBPermute() const
bool hasMed3_16() const
unsigned getReservedNumSGPRs(const MachineFunction &MF) const
bool hasUnalignedScratchAccessEnabled() const
bool hasNullExportTarget() const
Return true if the target's EXP instruction supports the NULL export target.
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool useAA() const override
bool isWave32() const
bool isGFX11Plus() const
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs, unsigned DynamicVGPRBlockSize) const
Return the maximum number of waves per SIMD for kernels using VGPRs VGPRs.
bool hasUnalignedBufferAccessEnabled() const
bool isWaveSizeKnown() const
Returns if the wavesize of this subtarget is known reliable.
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
unsigned getMinFlatWorkGroupSize() const override
bool hasAsyncMark() const
bool hasSPackHL() const
Return true if the target has the S_PACK_HL_B32_B16 instruction.
bool supportsMinMaxDenormModes() const
bool supportsWave64() const
bool supportsBPermute() const
bool hasFormattedMUBUFInsts() const
bool hasFlatScratchSVSMode() const
unsigned InstCacheLineSize
bool supportsWGP() const
bool hasAtomicFaddInsts() const
bool hasSubClampInsts() const
bool requiresWaitXCntForSingleAccessInstructions() const
unsigned getNSAMaxSize(bool HasSampler=false) const
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const
Return the maximum number of waves per SIMD for kernels using SGPRs SGPRs.
bool hasVOP3DPP() const
unsigned getMaxFlatWorkGroupSize() const override
unsigned getMaxNumUserSGPRs() const
unsigned MaxHardClauseLength
The maximum number of instructions that may be placed within an S_CLAUSE, which is one greater than t...
bool hasPermLaneX16() const
bool hasFlatScratchSVSSwizzleBug() const
bool hasVDecCoExecHazard() const
bool hasSignedGVSOffset() const
bool hasLDSFPAtomicAddF32() const
unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDelayAlu() const
Return true if the target has the S_DELAY_ALU instruction.
bool hasReadM0SendMsgHazard() const
bool hasVectorMulU64() const
bool hasScalarMulHiInsts() const
bool hasSCmpK() const
const LegalizerInfo * getLegalizerInfo() const override
bool requiresWaitIdleBeforeGetReg() const
bool hasDS96AndDS128() const
bool hasReadM0LdsDirectHazard() const
static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI)
Generation getGeneration() const
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM)
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const
std::pair< unsigned, unsigned > getMaxNumVectorRegs(const Function &F) const
Return a pair of maximum numbers of VGPRs and AGPRs that meet the number of waves per execution unit ...
bool isXNACKEnabled() const
bool hasScalarAddSub64() const
bool hasSplitBarriers() const
bool enableEarlyIfConversion() const override
bool hasSMRDReadVALUDefHazard() const
A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR was written by a VALU inst...
bool hasSGetShaderCyclesInst() const
bool hasINVWBL2WaitCntRequirement() const
bool hasRFEHazards() const
bool hasVMEMReadSGPRVALUDefHazard() const
A read of an SGPR by a VMEM instruction requires 5 wait states when the SGPR was written by a VALU In...
bool hasFlatScratchSTMode() const
unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const
bool hasGWSSemaReleaseAll() const
bool hasAddr64() const
unsigned getAddressableNumSGPRs() const
bool hasReadVCCZBug() const
Extra wait hazard is needed in some cases before s_cbranch_vccnz/s_cbranch_vccz.
bool isWave64() const
unsigned getDynamicVGPRBlockSize() const
bool setRegModeNeedsVNOPs() const
bool hasFractBug() const
bool isPreciseMemoryEnabled() const
unsigned getMaxWaveScratchSize() const
bool hasLDSMisalignedBugInWGPMode() const
bool hasMTBUFInsts() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
~GCNSubtarget() override
const SelectionDAGTargetInfo * getSelectionDAGInfo() const override
bool hasVOPD3() const
bool hasAtomicCSub() const
AMDGPU::IsaInfo::AMDGPUTargetID TargetID
bool requiresCodeObjectV6() const
const CallLowering * getCallLowering() const override
bool hasLdsDirect() const
bool hasGWSAutoReplay() const
static unsigned getNumUserSGPRForField(UserSGPRID ID)
void allocKernargPreloadSGPRs(unsigned NumSGPRs)
bool hasPrivateSegmentBuffer() const
unsigned getNumKernargPreloadSGPRs() const
unsigned getNumUsedUserSGPRs() const
GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST)
Itinerary data supplied by a subtarget to be used by a target.
Scheduling dependency.
Definition ScheduleDAG.h:51
const TargetRegisterClass * getBoolRC() const
Scheduling unit. This is a node in the scheduling DAG.
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Provide an instruction scheduling machine model to CodeGen passes.
TargetSubtargetInfo - Generic base class for all target subtargets.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, std::optional< bool > EnableWavefrontSize32)
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI)
unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize)
unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize)
unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI)
constexpr unsigned getMaxFlatWorkGroupSize()
unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI)
unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU)
unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI)
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getMaxNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, bool Addressable)
unsigned getMinWavesPerEU(const MCSubtargetInfo *STI)
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, unsigned DynamicVGPRBlockSize)
unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI)
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, unsigned DynamicVGPRBlockSize)
unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI)
unsigned getTotalNumSGPRs(const MCSubtargetInfo *STI)
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:263
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Define a generic scheduling policy for targets that don't provide their own MachineSchedStrategy.
A region of an MBB for scheduling.