LLVM 23.0.0git
GCNSubtarget.h
Go to the documentation of this file.
1//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// AMD GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17#include "AMDGPUCallLowering.h"
19#include "AMDGPUSubtarget.h"
20#include "SIFrameLowering.h"
21#include "SIISelLowering.h"
22#include "SIInstrInfo.h"
26
27#define GET_SUBTARGETINFO_HEADER
28#include "AMDGPUGenSubtargetInfo.inc"
29
30namespace llvm {
31
32class GCNTargetMachine;
33
35 public AMDGPUSubtarget {
36public:
38
39 // Following 2 enums are documented at:
40 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
41 enum class TrapHandlerAbi {
42 NONE = 0x00,
43 AMDHSA = 0x01,
44 };
45
46 enum class TrapID {
49 };
50
51private:
52 /// SelectionDAGISel related APIs.
53 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
54
55 /// GlobalISel related APIs.
56 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
57 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
58 std::unique_ptr<InstructionSelector> InstSelector;
59 std::unique_ptr<LegalizerInfo> Legalizer;
60 std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
61
62protected:
63 // Basic subtarget description.
65 unsigned Gen = INVALID;
67 int LDSBankCount = 0;
69
70 // Instruction cache line size in bytes; set from TableGen subtarget features.
71 unsigned InstCacheLineSize = 0;
72
73 // Dynamically set bits that enable features.
74 bool DynamicVGPR = false;
76 bool ScalarizeGlobal = false;
77
78 /// The maximum number of instructions that may be placed within an S_CLAUSE,
79 /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
80 /// indicates a lack of S_CLAUSE support.
81 unsigned MaxHardClauseLength = 0;
82
83#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
84 bool ATTRIBUTE = DEFAULT;
85#include "AMDGPUGenSubtargetInfo.inc"
86
87private:
88 SIInstrInfo InstrInfo;
89 SITargetLowering TLInfo;
90 SIFrameLowering FrameLowering;
91
92 /// Get the register that represents the actual dependency between the
93 /// definition and the use. The definition might only affect a subregister
94 /// that is not actually used. Works for both virtual and physical registers.
95 /// Note: Currently supports VOP3P instructions (without WMMA and SWMMAC).
96 /// Returns the definition register if there is a real dependency and no
97 /// better match is found.
98 Register getRealSchedDependency(const MachineInstr &DefI, int DefOpIdx,
99 const MachineInstr &UseI, int UseOpIdx) const;
100
101public:
102 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
103 const GCNTargetMachine &TM);
104 ~GCNSubtarget() override;
105
107 StringRef FS);
108
109 /// Diagnose inconsistent subtarget features before attempting to codegen
110 /// function \p F.
111 void checkSubtargetFeatures(const Function &F) const;
112
113 const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; }
114
115 const SIFrameLowering *getFrameLowering() const override {
116 return &FrameLowering;
117 }
118
119 const SITargetLowering *getTargetLowering() const override { return &TLInfo; }
120
121 const SIRegisterInfo *getRegisterInfo() const override {
122 return &InstrInfo.getRegisterInfo();
123 }
124
125 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
126
127 const CallLowering *getCallLowering() const override {
128 return CallLoweringInfo.get();
129 }
130
131 const InlineAsmLowering *getInlineAsmLowering() const override {
132 return InlineAsmLoweringInfo.get();
133 }
134
136 return InstSelector.get();
137 }
138
139 const LegalizerInfo *getLegalizerInfo() const override {
140 return Legalizer.get();
141 }
142
143 const AMDGPURegisterBankInfo *getRegBankInfo() const override {
144 return RegBankInfo.get();
145 }
146
148 return TargetID;
149 }
150
152 return &InstrItins;
153 }
154
156
158
159 bool isGFX11Plus() const { return getGeneration() >= GFX11; }
160
161#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
162 bool GETTER() const override { return ATTRIBUTE; }
163#include "AMDGPUGenSubtargetInfo.inc"
164
165 unsigned getMaxWaveScratchSize() const {
166 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
167 if (getGeneration() >= GFX12) {
168 // 18-bit field in units of 64-dword.
169 return (64 * 4) * ((1 << 18) - 1);
170 }
171 if (getGeneration() == GFX11) {
172 // 15-bit field in units of 64-dword.
173 return (64 * 4) * ((1 << 15) - 1);
174 }
175 // 13-bit field in units of 256-dword.
176 return (256 * 4) * ((1 << 13) - 1);
177 }
178
179 /// Return the number of high bits known to be zero for a frame index.
183
184 int getLDSBankCount() const { return LDSBankCount; }
185
186 /// Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
187 unsigned getInstCacheLineSize() const { return InstCacheLineSize; }
188
189 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
190 return (ForBufferRSrc || !hasFlatScratchEnabled()) ? MaxPrivateElementSize
191 : 16;
192 }
193
194 unsigned getConstantBusLimit(unsigned Opcode) const;
195
196 /// Returns if the result of this instruction with a 16-bit result returned in
197 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserving
198 /// the original value.
199 bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
200
201 bool supportsWGP() const {
202 if (HasGFX1250Insts)
203 return false;
204 return getGeneration() >= GFX10;
205 }
206
207 bool hasHWFP64() const { return HasFP64; }
208
209 bool hasAddr64() const {
211 }
212
213 bool hasFlat() const {
215 }
216
217 // Return true if the target only has the reverse operand versions of VALU
218 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
219 bool hasOnlyRevVALUShifts() const {
221 }
222
223 bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; }
224
225 bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; }
226
227 bool hasMin3Max3_16() const {
229 }
230
231 bool hasSwap() const { return HasGFX9Insts; }
232
233 bool hasScalarPackInsts() const { return HasGFX9Insts; }
234
235 bool hasScalarMulHiInsts() const { return HasGFX9Insts; }
236
237 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
238
239 bool hasAsyncMark() const { return hasVMemToLDSLoad() || HasAsynccnt; }
240
244
246 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
247 return getGeneration() >= GFX9;
248 }
249
250 /// True if the offset field of DS instructions works as expected. On SI, the
251 /// offset uses a 16-bit adder and does not always wrap properly.
252 bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; }
253
255 return EnableUnsafeDSOffsetFolding;
256 }
257
258 /// Condition output from div_scale is usable.
262
263 /// Extra wait hazard is needed in some cases before
264 /// s_cbranch_vccnz/s_cbranch_vccz.
265 bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; }
266
267 /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
268 bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; }
269
270 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
271 /// was written by a VALU instruction.
274 }
275
276 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
277 /// SGPR was written by a VALU Instruction.
280 }
281
282 bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; }
283
284 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
285 unsigned getSetRegWaitStates() const {
286 return getGeneration() <= SEA_ISLANDS ? 1 : 2;
287 }
288
289 /// Return the amount of LDS that can be used that will not restrict the
290 /// occupancy lower than WaveCount.
291 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
292 const Function &) const;
293
296 }
297
298 /// \returns If target supports S_DENORM_MODE.
299 bool hasDenormModeInst() const {
301 }
302
303 /// \returns If target supports ds_read/write_b128 and user enables generation
304 /// of ds_read/write_b128.
305 bool useDS128() const { return HasCIInsts && EnableDS128; }
306
307 /// \return If target supports ds_read/write_b96/128.
308 bool hasDS96AndDS128() const { return HasCIInsts; }
309
310 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
311 bool haveRoundOpsF64() const { return HasCIInsts; }
312
313 /// \returns If MUBUF instructions always perform range checking, even for
314 /// buffer resources used for private memory access.
318
319 /// \returns If target requires PRT Strict NULL support (zero result registers
320 /// for sparse texture support).
321 bool usePRTStrictNull() const { return EnablePRTStrictNull; }
322
324 return HasUnalignedBufferAccess && HasUnalignedAccessMode;
325 }
326
328 return HasUnalignedDSAccess && HasUnalignedAccessMode;
329 }
330
332 return HasUnalignedScratchAccess && HasUnalignedAccessMode;
333 }
334
335 bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
336
337 bool isTgSplitEnabled() const { return EnableTgSplit; }
338
339 bool isCuModeEnabled() const { return EnableCuMode; }
340
341 bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
342
343 bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }
344
345 // Check if target supports ST addressing mode with FLAT scratch instructions.
346 // The ST addressing mode means no registers are used, either VGPR or SGPR,
347 // but only immediate offset is swizzled and added to the FLAT scratch base.
348 bool hasFlatScratchSTMode() const {
349 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
350 }
351
352 bool hasFlatScratchSVSMode() const { return HasGFX940Insts || HasGFX11Insts; }
353
355 return hasArchitectedFlatScratch() ||
356 (EnableFlatScratch && hasFlatScratchInsts());
357 }
358
359 bool hasGlobalAddTidInsts() const { return HasGFX10_BEncoding; }
360
361 bool hasAtomicCSub() const { return HasGFX10_BEncoding; }
362
363 bool hasExportInsts() const {
364 return !hasGFX940Insts() && !hasGFX1250Insts();
365 }
366
367 bool hasVINTERPEncoding() const {
368 return HasGFX11Insts && !hasGFX1250Insts();
369 }
370
371 // DS_ADD_F64/DS_ADD_RTN_F64
372 bool hasLdsAtomicAddF64() const {
373 return hasGFX90AInsts() || hasGFX1250Insts();
374 }
375
377 return getGeneration() >= GFX9;
378 }
379
380 bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }
381
382 bool hasD16LoadStore() const { return getGeneration() >= GFX9; }
383
385 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
386 }
387
388 bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }
389
390 /// Return if most LDS instructions have an m0 use that requires m0 to be
391 /// initialized.
392 bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }
393
394 // True if the hardware rewinds and replays GWS operations if a wave is
395 // preempted.
396 //
397 // If this is false, a GWS operation requires testing if a nack set the
398 // MEM_VIOL bit, and repeating if so.
399 bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }
400
401 /// \returns if target has ds_gws_sema_release_all instruction.
402 bool hasGWSSemaReleaseAll() const { return HasCIInsts; }
403
404 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
405
406 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
407
408 // Covers VS/PS/CS graphics shaders
409 bool isMesaGfxShader(const Function &F) const {
410 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
411 }
412
413 bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
414
415 bool hasAtomicFaddInsts() const {
416 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
417 }
418
420 return getGeneration() < SEA_ISLANDS;
421 }
422
423 bool hasInstPrefetch() const {
424 return getGeneration() == GFX10 || getGeneration() == GFX11;
425 }
426
427 bool hasPrefetch() const { return HasGFX12Insts; }
428
429 bool hasInstPrefSize() const { return isGFX11Plus(); }
430
431 void getInstPrefSizeArgs(uint32_t &Mask, uint32_t &Shift, uint32_t &Width,
432 uint32_t &CacheLineSize) const {
435 if (getGeneration() == GFX11) {
436 Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
437 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
438 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
439 } else {
440 Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
441 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
442 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
443 }
444 }
445
446 // Has s_cmpk_* instructions.
447 bool hasSCmpK() const { return getGeneration() < GFX12; }
448
449 // Scratch is allocated in 256 dword per wave blocks for the entire
450 // wavefront. When viewed from the perspective of an arbitrary workitem, this
451 // is 4-byte aligned.
452 //
453 // Only 4-byte alignment is really needed to access anything. Transformations
454 // on the pointer value itself may rely on the alignment / known low bits of
455 // the pointer. Set this to something above the minimum to avoid needing
456 // dynamic realignment in common cases.
457 Align getStackAlignment() const { return Align(16); }
458
459 bool enableMachineScheduler() const override { return true; }
460
461 bool useAA() const override;
462
463 bool enableSubRegLiveness() const override { return true; }
464
467
468 // static wrappers
469 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
470
471 // XXX - Why is this here if it isn't in the default pass set?
472 bool enableEarlyIfConversion() const override { return true; }
473
475 const SchedRegion &Region) const override;
476
478 const SchedRegion &Region) const override;
479
480 void mirFileLoaded(MachineFunction &MF) const override;
481
482 unsigned getMaxNumUserSGPRs() const {
483 return AMDGPU::getMaxNumUserSGPRs(*this);
484 }
485
486 bool useVGPRIndexMode() const;
487
488 bool hasScalarCompareEq64() const {
490 }
491
492 bool hasLDSFPAtomicAddF32() const { return HasGFX8Insts; }
493 bool hasLDSFPAtomicAddF64() const {
494 return HasGFX90AInsts || HasGFX1250Insts;
495 }
496
497 /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
498 bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
499
500 /// \returns true if the subtarget has the v_permlane64_b32 instruction.
501 bool hasPermLane64() const { return getGeneration() >= GFX11; }
502
503 bool hasDPPRowShare() const {
504 return HasDPP && (HasGFX90AInsts || getGeneration() >= GFX10);
505 }
506
507 // Has V_PK_MOV_B32 opcode
508 bool hasPkMovB32() const { return HasGFX90AInsts; }
509
511 return getGeneration() >= GFX10 || hasGFX940Insts();
512 }
513
514 bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }
515
516 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
517
518 unsigned getNSAMaxSize(bool HasSampler = false) const {
519 return AMDGPU::getNSAMaxSize(*this, HasSampler);
520 }
521
522 bool hasMadF16() const;
523
524 // Scalar and global loads support scale_offset bit.
525 bool hasScaleOffset() const { return HasGFX1250Insts; }
526
527 // FLAT GLOBAL VOffset is signed
528 bool hasSignedGVSOffset() const { return HasGFX1250Insts; }
529
531
533 return HasUserSGPRInit16Bug && isWave32();
534 }
535
539
540 // \returns true if the subtarget supports DWORDX3 load/store instructions.
541 bool hasDwordx3LoadStores() const { return HasCIInsts; }
542
546
551
554 }
555
558 }
559
561 return HasLDSMisalignedBug && !EnableCuMode;
562 }
563
564 // Shift amount of a 64 bit shift cannot be a highest allocated register
565 // if also at the end of the allocation block.
566 bool hasShift64HighRegBug() const { return HasGFX90AInsts; }
567
568 // Has one cycle hazard on transcendental instruction feeding a
569 // non transcendental VALU.
570 bool hasTransForwardingHazard() const { return HasGFX940Insts; }
571
572 // Has one cycle hazard on a VALU instruction partially writing dst with
573 // a shift of result bits feeding another VALU instruction.
574 bool hasDstSelForwardingHazard() const { return HasGFX940Insts; }
575
576 // Cannot use op_sel with v_dot instructions.
577 bool hasDOTOpSelHazard() const { return HasGFX940Insts || HasGFX11Insts; }
578
579 // Does not have HW interlocks for VALU writing and then reading SGPRs.
580 bool hasVDecCoExecHazard() const { return HasGFX940Insts; }
581
582 bool hasHardClauses() const { return MaxHardClauseLength > 0; }
583
585 return getGeneration() == GFX10;
586 }
587
588 bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
589
590 bool hasLdsDirect() const { return getGeneration() >= GFX11; }
591
592 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
593
595 return getGeneration() == GFX11;
596 }
597
598 bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; }
599
600 // All GFX9 targets experience a fetch delay when an instruction at the start
601 // of a loop header is split by a 32-byte fetch window boundary, but GFX950
602 // is uniquely sensitive to this: the delay triggers further performance
603 // degradation beyond the fetch latency itself.
604 bool hasLoopHeadInstSplitSensitivity() const { return HasGFX950Insts; }
605
606 bool requiresCodeObjectV6() const { return RequiresCOV6; }
607
608 bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
609
610 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
611
613 return HasGFX12Insts && !HasGFX1250Insts;
614 }
615
616 bool setRegModeNeedsVNOPs() const {
617 return HasGFX1250Insts && getGeneration() == GFX12;
618 }
619
620 /// Return if operations acting on VGPR tuples require even alignment.
621 bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
622
623 /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
624 bool hasSPackHL() const { return HasGFX11Insts; }
625
626 /// Return true if the target's EXP instruction has the COMPR flag, which
627 /// affects the meaning of the EN (enable) bits.
628 bool hasCompressedExport() const { return !HasGFX11Insts; }
629
630 /// Return true if the target's EXP instruction supports the NULL export
631 /// target.
632 bool hasNullExportTarget() const { return !HasGFX11Insts; }
633
634 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
635
636 /// Return true if the target has the S_DELAY_ALU instruction.
637 bool hasDelayAlu() const { return HasGFX11Insts; }
638
639 /// Returns true if the target supports
640 /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
641 /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
642 bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }
643
644 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
645 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
646 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
647
648 /// \returns true if the target has packed f32 instructions that only read 32
649 /// bits from a scalar operand (SGPR or literal) and replicates the bits to
650 /// both channels.
652 return getGeneration() == GFX12 && HasGFX1250Insts;
653 }
654
655 bool hasAddPC64Inst() const { return HasGFX1250Insts; }
656
657 /// \returns true if the target supports expert scheduling mode 2 which relies
658 /// on the compiler to insert waits to avoid hazards between VMEM and VALU
659 /// instructions in some instances.
660 bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; }
661
662 /// \returns The maximum number of instructions that can be enclosed in an
663 /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
664 /// instruction.
665 unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
666
667 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
668 /// SGPRs
669 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
670
671 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
672 /// VGPRs
673 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
674 unsigned DynamicVGPRBlockSize) const;
675
676 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
677 /// be achieved when the only function running on a CU is \p F, each workgroup
678 /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
679 /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
680 /// range, so this returns a range as well.
681 ///
682 /// Note that occupancy can be affected by the scratch allocation as well, but
683 /// we do not have enough information to compute it.
684 std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
685 unsigned LDSSize = 0,
686 unsigned NumSGPRs = 0,
687 unsigned NumVGPRs = 0) const;
688
689 /// \returns true if the flat_scratch register should be initialized with the
690 /// pointer to the wave's scratch memory rather than a size and offset.
691 bool flatScratchIsPointer() const {
693 }
694
695 /// \returns true if the machine has merged shaders in which s0-s7 are
696 /// reserved by the hardware and user SGPRs start at s8
697 bool hasMergedShaders() const { return getGeneration() >= GFX9; }
698
699 // \returns true if the target supports the pre-NGG legacy geometry path.
700 bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
701
702 // \returns true if the target has split barriers feature
703 bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
704
705 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
706 bool hasRrWGMode() const { return getGeneration() >= GFX12; }
707
708 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
709 /// values.
710 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
711
712 bool hasINVWBL2WaitCntRequirement() const { return HasGFX1250Insts; }
713
714 bool hasVOPD3() const { return HasGFX1250Insts; }
715
716 // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
717 bool hasIntMinMax64() const { return HasGFX1250Insts; }
718
719 // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
720 bool hasPkMinMax3Insts() const { return HasGFX1250Insts; }
721
722 // \returns true if the target has the S_GET_SHADER_CYCLES_U64 instruction.
723 bool hasSGetShaderCyclesInst() const { return HasGFX1250Insts; }
724
725 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
726 // of sign-extending. Note that GFX1250 has not only fixed the bug but also
727 // extended VA to 57 bits.
729 return HasGFX12Insts && !HasGFX1250Insts;
730 }
731
732 // \returns true if the target needs to create a prolog for backward
733 // compatibility when preloading kernel arguments.
735 return hasKernargPreload() && !HasGFX1250Insts;
736 }
737
738 bool hasCondSubInsts() const { return HasGFX12Insts; }
739
740 bool hasSubClampInsts() const { return hasGFX10_3Insts(); }
741
742 bool hasFmaLegacy32Insts() const { return hasGFX10_3Insts(); }
743
744 /// \returns SGPR allocation granularity supported by the subtarget.
745 unsigned getSGPRAllocGranule() const {
747 }
748
749 /// \returns SGPR encoding granularity supported by the subtarget.
750 unsigned getSGPREncodingGranule() const {
752 }
753
754 /// \returns Total number of SGPRs supported by the subtarget.
755 unsigned getTotalNumSGPRs() const {
757 }
758
759 /// \returns Addressable number of SGPRs supported by the subtarget.
760 unsigned getAddressableNumSGPRs() const {
762 }
763
764 /// \returns Minimum number of SGPRs that meets the given number of waves per
765 /// execution unit requirement supported by the subtarget.
766 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
767 return AMDGPU::IsaInfo::getMinNumSGPRs(*this, WavesPerEU);
768 }
769
770 /// \returns Maximum number of SGPRs that meets the given number of waves per
771 /// execution unit requirement supported by the subtarget.
772 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
773 return AMDGPU::IsaInfo::getMaxNumSGPRs(*this, WavesPerEU, Addressable);
774 }
775
776 /// \returns Reserved number of SGPRs. This is common
777 /// utility function called by MachineFunction and
778 /// Function variants of getReservedNumSGPRs.
779 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
780 /// \returns Reserved number of SGPRs for given machine function \p MF.
781 unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
782
783 /// \returns Reserved number of SGPRs for given function \p F.
784 unsigned getReservedNumSGPRs(const Function &F) const;
785
786 /// \returns Maximum number of preloaded SGPRs for the subtarget.
787 unsigned getMaxNumPreloadedSGPRs() const;
788
789 /// \returns max num SGPRs. This is the common utility
790 /// function called by MachineFunction and Function
791 /// variants of getMaxNumSGPRs.
792 unsigned getBaseMaxNumSGPRs(const Function &F,
793 std::pair<unsigned, unsigned> WavesPerEU,
794 unsigned PreloadedSGPRs,
795 unsigned ReservedNumSGPRs) const;
796
797 /// \returns Maximum number of SGPRs that meets number of waves per execution
798 /// unit requirement for function \p MF, or number of SGPRs explicitly
799 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
800 ///
801 /// \returns Value that meets number of waves per execution unit requirement
802 /// if explicitly requested value cannot be converted to integer, violates
803 /// subtarget's specifications, or does not meet number of waves per execution
804 /// unit requirement.
805 unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
806
807 /// \returns Maximum number of SGPRs that meets number of waves per execution
808 /// unit requirement for function \p F, or number of SGPRs explicitly
809 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
810 ///
811 /// \returns Value that meets number of waves per execution unit requirement
812 /// if explicitly requested value cannot be converted to integer, violates
813 /// subtarget's specifications, or does not meet number of waves per execution
814 /// unit requirement.
815 unsigned getMaxNumSGPRs(const Function &F) const;
816
817 /// \returns VGPR allocation granularity supported by the subtarget.
818 unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
819 return AMDGPU::IsaInfo::getVGPRAllocGranule(*this, DynamicVGPRBlockSize);
820 }
821
822 /// \returns VGPR encoding granularity supported by the subtarget.
823 unsigned getVGPREncodingGranule() const {
825 }
826
827 /// \returns Total number of VGPRs supported by the subtarget.
828 unsigned getTotalNumVGPRs() const {
830 }
831
832 /// \returns Addressable number of architectural VGPRs supported by the
833 /// subtarget.
837
838 /// \returns Addressable number of VGPRs supported by the subtarget.
839 unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
840 return AMDGPU::IsaInfo::getAddressableNumVGPRs(*this, DynamicVGPRBlockSize);
841 }
842
843 /// \returns the minimum number of VGPRs that will prevent achieving more than
844 /// the specified number of waves \p WavesPerEU.
845 unsigned getMinNumVGPRs(unsigned WavesPerEU,
846 unsigned DynamicVGPRBlockSize) const {
847 return AMDGPU::IsaInfo::getMinNumVGPRs(*this, WavesPerEU,
848 DynamicVGPRBlockSize);
849 }
850
851 /// \returns the maximum number of VGPRs that can be used and still achieve
852 /// at least the specified number of waves \p WavesPerEU.
853 unsigned getMaxNumVGPRs(unsigned WavesPerEU,
854 unsigned DynamicVGPRBlockSize) const {
855 return AMDGPU::IsaInfo::getMaxNumVGPRs(*this, WavesPerEU,
856 DynamicVGPRBlockSize);
857 }
858
859 /// \returns max num VGPRs. This is the common utility function
860 /// called by MachineFunction and Function variants of getMaxNumVGPRs.
861 unsigned
863 std::pair<unsigned, unsigned> NumVGPRBounds) const;
864
865 /// \returns Maximum number of VGPRs that meets number of waves per execution
866 /// unit requirement for function \p F, or number of VGPRs explicitly
867 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
868 ///
869 /// \returns Value that meets number of waves per execution unit requirement
870 /// if explicitly requested value cannot be converted to integer, violates
871 /// subtarget's specifications, or does not meet number of waves per execution
872 /// unit requirement.
873 unsigned getMaxNumVGPRs(const Function &F) const;
874
875 unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }
876
877 /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
878 /// of waves per execution unit required for the function \p F.
879 std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
880
881 /// \returns Maximum number of VGPRs that meets number of waves per execution
882 /// unit requirement for function \p MF, or number of VGPRs explicitly
883 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
884 ///
885 /// \returns Value that meets number of waves per execution unit requirement
886 /// if explicitly requested value cannot be converted to integer, violates
887 /// subtarget's specifications, or does not meet number of waves per execution
888 /// unit requirement.
889 unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
890
891 bool supportsWave32() const { return getGeneration() >= GFX10; }
892
893 bool supportsWave64() const { return !hasGFX1250Insts() || HasGFX13Insts; }
894
895 bool isWave32() const { return getWavefrontSize() == 32; }
896
897 bool isWave64() const { return getWavefrontSize() == 64; }
898
899 /// Returns if the wavesize of this subtarget is known reliable. This is false
900 /// only for the a default target-cpu that does not have an explicit
901 /// +wavefrontsize target feature.
902 bool isWaveSizeKnown() const {
903 return hasFeature(AMDGPU::FeatureWavefrontSize32) ||
904 hasFeature(AMDGPU::FeatureWavefrontSize64);
905 }
906
908 return getRegisterInfo()->getBoolRC();
909 }
910
911 /// \returns Maximum number of work groups per compute unit supported by the
912 /// subtarget and limited by given \p FlatWorkGroupSize.
913 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
914 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(*this, FlatWorkGroupSize);
915 }
916
917 /// \returns Minimum flat work group size supported by the subtarget.
918 unsigned getMinFlatWorkGroupSize() const override {
920 }
921
922 /// \returns Maximum flat work group size supported by the subtarget.
923 unsigned getMaxFlatWorkGroupSize() const override {
925 }
926
927 /// \returns Number of waves per execution unit required to support the given
928 /// \p FlatWorkGroupSize.
929 unsigned
930 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
931 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(*this, FlatWorkGroupSize);
932 }
933
934 /// \returns Minimum number of waves per execution unit supported by the
935 /// subtarget.
936 unsigned getMinWavesPerEU() const override {
938 }
939
940 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
941 SDep &Dep,
942 const TargetSchedModel *SchedModel) const override;
943
944 // \returns true if it's beneficial on this subtarget for the scheduler to
945 // cluster stores as well as loads.
946 bool shouldClusterStores() const { return getGeneration() >= GFX11; }
947
948 /// \returns the number of address arguments from which to enable MIMG NSA
949 /// on supported architectures.
950 unsigned getNSAThreshold(const MachineFunction &MF) const;
951
952 /// \returns true if the subtarget has a hazard requiring an "s_nop 0"
953 /// instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
954 bool requiresNopBeforeDeallocVGPRs() const { return !HasGFX1250Insts; }
955
956 /// \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
957 /// STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
958 bool requiresWaitIdleBeforeGetReg() const { return HasGFX1250Insts; }
959
960 bool isDynamicVGPREnabled() const { return DynamicVGPR; }
961 unsigned getDynamicVGPRBlockSize() const {
962 return DynamicVGPRBlockSize32 ? 32 : 16;
963 }
964
966 // AMDGPU doesn't care if early-clobber and undef operands are allocated
967 // to the same register.
968 return false;
969 }
970
971 // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
972 // and must be surrounded by S_WAIT_ALU(0xFFE3).
974 return getGeneration() == GFX12;
975 }
976
977 // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
978 // read.
980 return HasGFX1250Insts && getGeneration() == GFX12;
981 }
982
983 // src_flat_scratch_hi cannot be used as a source in SALU producing a 64-bit
984 // result.
986 return HasGFX1250Insts && getGeneration() == GFX12;
987 }
988
989 /// \returns true if the subtarget requires a wait for xcnt before VMEM
990 /// accesses that must never be repeated in the event of a page fault/re-try.
991 /// Atomic stores/rmw and all volatile accesses fall under this criteria.
993 return HasGFX1250Insts;
994 }
995
996 /// \returns the number of significant bits in the immediate field of the
997 /// S_NOP instruction.
998 unsigned getSNopBits() const {
1000 return 7;
1002 return 4;
1003 return 3;
1004 }
1005
1009
1011 return (getGeneration() <= AMDGPUSubtarget::GFX9 ||
1013 isWave32();
1014 }
1015
1016 /// Return true if real (non-fake) variants of True16 instructions using
1017 /// 16-bit registers should be code-generated. Fake True16 instructions are
1018 /// identical to non-fake ones except that they take 32-bit registers as
1019 /// operands and always use their low halves.
1020 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
1021 // supported and the support for fake True16 instructions is removed.
1022 bool useRealTrue16Insts() const {
1023 return hasTrue16BitInsts() && EnableRealTrue16Insts;
1024 }
1025
1027 return getGeneration() >= GFX10 || isTgSplitEnabled();
1028 }
1029};
1030
1032public:
1033 bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1034
1035 bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1036
1037 bool hasDispatchPtr() const { return DispatchPtr; }
1038
1039 bool hasQueuePtr() const { return QueuePtr; }
1040
1041 bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1042
1043 bool hasDispatchID() const { return DispatchID; }
1044
1045 bool hasFlatScratchInit() const { return FlatScratchInit; }
1046
1047 bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
1048
1049 unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1050
1051 unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1052
1053 unsigned getNumFreeUserSGPRs();
1054
1055 void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1056
1067
1068 // Returns the size in number of SGPRs for preload user SGPR field.
1070 switch (ID) {
1072 return 2;
1074 return 4;
1075 case DispatchPtrID:
1076 return 2;
1077 case QueuePtrID:
1078 return 2;
1080 return 2;
1081 case DispatchIdID:
1082 return 2;
1083 case FlatScratchInitID:
1084 return 2;
1086 return 1;
1087 }
1088 llvm_unreachable("Unknown UserSGPRID.");
1089 }
1090
1091 GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1092
1093private:
1094 const GCNSubtarget &ST;
1095
1096 // Private memory buffer
1097 // Compute directly in sgpr[0:1]
1098 // Other shaders indirect 64-bits at sgpr[0:1]
1099 bool ImplicitBufferPtr = false;
1100
1101 bool PrivateSegmentBuffer = false;
1102
1103 bool DispatchPtr = false;
1104
1105 bool QueuePtr = false;
1106
1107 bool KernargSegmentPtr = false;
1108
1109 bool DispatchID = false;
1110
1111 bool FlatScratchInit = false;
1112
1113 bool PrivateSegmentSize = false;
1114
1115 unsigned NumKernargPreloadSGPRs = 0;
1116
1117 unsigned NumUsedUserSGPRs = 0;
1118};
1119
1120} // end namespace llvm
1121
1122#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static cl::opt< bool > EnableLoadStoreOpt("aarch64-enable-ldst-opt", cl::desc("Enable the load/store pair" " optimization pass"), cl::init(true), cl::Hidden)
This file describes how to lower LLVM calls to machine code calls.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
Base class for AMDGPU specific classes of TargetSubtarget.
AMDHSA kernel descriptor definitions.
#define F(x, y, z)
Definition MD5.cpp:54
Promote Memory to Register
Definition Mem2Reg.cpp:110
SI DAG Lowering interface definition.
Interface definition for SIInstrInfo.
static cl::opt< unsigned > CacheLineSize("cache-line-size", cl::init(0), cl::Hidden, cl::desc("Use this to override the target cache line size when " "specified by the user."))
unsigned getWavefrontSizeLog2() const
AMDGPUSubtarget(const Triple &TT)
unsigned getMaxWavesPerEU() const
unsigned getWavefrontSize() const
bool hasPrefetch() const
bool hasFlat() const
bool hasD16Images() const
InstrItineraryData InstrItins
bool useVGPRIndexMode() const
bool partialVCCWritesUpdateVCCZ() const
Writes to VCC_LO/VCC_HI update the VCCZ flag.
bool hasSwap() const
bool hasPkMinMax3Insts() const
bool hasD16LoadStore() const
bool hasMergedShaders() const
bool hasRrWGMode() const
bool hasScalarCompareEq64() const
int getLDSBankCount() const
bool hasOnlyRevVALUShifts() const
bool hasNonNSAEncoding() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasExpertSchedulingMode() const
void mirFileLoaded(MachineFunction &MF) const override
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool loadStoreOptEnabled() const
bool enableSubRegLiveness() const override
unsigned getSGPRAllocGranule() const
bool hasLdsAtomicAddF64() const
bool hasFlatLgkmVMemCountInOrder() const
bool flatScratchIsPointer() const
bool requiresWaitOnWorkgroupReleaseFence() const
bool hasShift64HighRegBug() const
unsigned MaxPrivateElementSize
bool unsafeDSOffsetFoldingEnabled() const
bool hasFPAtomicToDenormModeHazard() const
unsigned getAddressableNumArchVGPRs() const
bool vmemWriteNeedsExpWaitcnt() const
bool shouldClusterStores() const
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasUserSGPRInit16BugInWave32() const
unsigned getSGPREncodingGranule() const
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS)
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasFlatScratchHiInB64InstHazard() const
bool hasDstSelForwardingHazard() const
void setScalarizeGlobalBehavior(bool b)
bool hasFlatScratchEnabled() const
unsigned getSNopBits() const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool hasMultiDwordFlatScratchAddressing() const
bool hasFmaakFmamkF64Insts() const
bool hasHWFP64() const
bool hasScaleOffset() const
bool hasDenormModeInst() const
bool hasCvtScaleForwardingHazard() const
unsigned getTotalNumVGPRs() const
unsigned getMinWavesPerEU() const override
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasVALUMaskWriteHazard() const
bool hasCondSubInsts() const
const InlineAsmLowering * getInlineAsmLowering() const override
unsigned getTotalNumSGPRs() const
const InstrItineraryData * getInstrItineraryData() const override
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const override
void overridePostRASchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
Align getStackAlignment() const
bool privateMemoryResourceIsRangeChecked() const
bool hasScalarSubwordLoads() const
bool hasMadF16() const
bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const
unsigned getInstCacheLineSize() const
Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
bool hasLoopHeadInstSplitSensitivity() const
bool hasDwordx3LoadStores() const
bool hasSignedScratchOffsets() const
bool hasGlobalAddTidInsts() const
bool hasFlatScrRegister() const
bool hasGetPCZeroExtension() const
bool hasPermLane64() const
bool requiresNopBeforeDeallocVGPRs() const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool supportsGetDoorbellID() const
bool supportsWave32() const
bool isTgSplitEnabled() const
unsigned getMaxNumAGPRs(const Function &F) const
bool hasReadM0MovRelInterpHazard() const
bool isDynamicVGPREnabled() const
bool hasInstPrefSize() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasDOTOpSelHazard() const
bool hasLdsWaitVMSRC() const
const TargetRegisterClass * getBoolRC() const
unsigned getBaseMaxNumVGPRs(const Function &F, std::pair< unsigned, unsigned > NumVGPRBounds) const
bool hasFmaakFmamkF32Insts() const
bool hasMad64_32() const
InstructionSelector * getInstructionSelector() const override
unsigned getVGPREncodingGranule() const
bool hasHardClauses() const
bool useDS128() const
bool hasExtendedWaitCounts() const
bool d16PreservesUnusedBits() const
bool hasInstPrefetch() const
bool hasAddPC64Inst() const
unsigned maxHardClauseLength() const
bool isMesaGfxShader(const Function &F) const
bool hasExportInsts() const
bool hasVINTERPEncoding() const
const AMDGPURegisterBankInfo * getRegBankInfo() const override
bool hasLegacyGeometry() const
TrapHandlerAbi getTrapHandlerAbi() const
bool isCuModeEnabled() const
const SIFrameLowering * getFrameLowering() const override
bool hasDPPRowShare() const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
unsigned getBaseMaxNumSGPRs(const Function &F, std::pair< unsigned, unsigned > WavesPerEU, unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
unsigned getMaxNumPreloadedSGPRs() const
GCNSubtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
bool has12DWordStoreHazard() const
bool hasVALUPartialForwardingHazard() const
void overrideSchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
bool useVGPRBlockOpsForCSR() const
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
bool needsKernArgPreloadProlog() const
bool hasMin3Max3_16() const
unsigned getMaxNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const
unsigned getSetRegWaitStates() const
Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
const SITargetLowering * getTargetLowering() const override
bool hasTransForwardingHazard() const
bool enableMachineScheduler() const override
bool hasLDSFPAtomicAddF64() const
unsigned getNSAThreshold(const MachineFunction &MF) const
bool getScalarizeGlobalBehavior() const
bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const
bool hasReadM0LdsDmaHazard() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasScratchBaseForwardingHazard() const
bool hasIntMinMax64() const
bool hasScalarPackInsts() const
bool requiresDisjointEarlyClobberAndUndef() const override
bool hasVALUReadSGPRHazard() const
bool usePRTStrictNull() const
unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const
bool supportsWaveWideBPermute() const
bool hasMed3_16() const
unsigned getReservedNumSGPRs(const MachineFunction &MF) const
bool hasUnalignedScratchAccessEnabled() const
bool hasNullExportTarget() const
Return true if the target's EXP instruction supports the NULL export target.
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool useAA() const override
bool isWave32() const
bool isGFX11Plus() const
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs, unsigned DynamicVGPRBlockSize) const
Return the maximum number of waves per SIMD for kernels using VGPRs VGPRs.
bool hasUnalignedBufferAccessEnabled() const
bool isWaveSizeKnown() const
Returns if the wavesize of this subtarget is known reliable.
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
unsigned getMinFlatWorkGroupSize() const override
bool hasAsyncMark() const
bool hasSPackHL() const
Return true if the target has the S_PACK_HL_B32_B16 instruction.
bool supportsMinMaxDenormModes() const
bool supportsWave64() const
bool supportsBPermute() const
bool hasFlatScratchSVSMode() const
unsigned InstCacheLineSize
bool supportsWGP() const
bool hasAtomicFaddInsts() const
bool hasSubClampInsts() const
bool requiresWaitXCntForSingleAccessInstructions() const
unsigned getNSAMaxSize(bool HasSampler=false) const
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const
Return the maximum number of waves per SIMD for kernels using SGPRs SGPRs.
bool hasVOP3DPP() const
void getInstPrefSizeArgs(uint32_t &Mask, uint32_t &Shift, uint32_t &Width, uint32_t &CacheLineSize) const
unsigned getMaxFlatWorkGroupSize() const override
unsigned getMaxNumUserSGPRs() const
unsigned MaxHardClauseLength
The maximum number of instructions that may be placed within an S_CLAUSE, which is one greater than t...
bool hasPermLaneX16() const
bool hasFlatScratchSVSSwizzleBug() const
bool hasVDecCoExecHazard() const
bool hasSignedGVSOffset() const
bool hasLDSFPAtomicAddF32() const
unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDelayAlu() const
Return true if the target has the S_DELAY_ALU instruction.
bool hasReadM0SendMsgHazard() const
bool hasScalarMulHiInsts() const
bool hasSCmpK() const
const LegalizerInfo * getLegalizerInfo() const override
bool requiresWaitIdleBeforeGetReg() const
bool hasDS96AndDS128() const
bool hasFmaLegacy32Insts() const
bool hasReadM0LdsDirectHazard() const
static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI)
Generation getGeneration() const
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM)
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const
std::pair< unsigned, unsigned > getMaxNumVectorRegs(const Function &F) const
Return a pair of maximum numbers of VGPRs and AGPRs that meet the number of waves per execution unit ...
bool isXNACKEnabled() const
bool hasScalarAddSub64() const
bool hasSplitBarriers() const
bool enableEarlyIfConversion() const override
bool hasSMRDReadVALUDefHazard() const
A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR was written by a VALU inst...
bool hasSGetShaderCyclesInst() const
bool hasINVWBL2WaitCntRequirement() const
bool hasRFEHazards() const
bool hasVMEMReadSGPRVALUDefHazard() const
A read of an SGPR by a VMEM instruction requires 5 wait states when the SGPR was written by a VALU In...
bool hasFlatScratchSTMode() const
unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const
bool hasGWSSemaReleaseAll() const
bool hasAddr64() const
unsigned getAddressableNumSGPRs() const
bool hasReadVCCZBug() const
Extra wait hazard is needed in some cases before s_cbranch_vccnz/s_cbranch_vccz.
bool isWave64() const
unsigned getDynamicVGPRBlockSize() const
bool setRegModeNeedsVNOPs() const
bool hasFractBug() const
bool isPreciseMemoryEnabled() const
unsigned getMaxWaveScratchSize() const
bool hasLDSMisalignedBugInWGPMode() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
~GCNSubtarget() override
const SelectionDAGTargetInfo * getSelectionDAGInfo() const override
bool hasVOPD3() const
bool hasAtomicCSub() const
AMDGPU::IsaInfo::AMDGPUTargetID TargetID
bool requiresCodeObjectV6() const
const CallLowering * getCallLowering() const override
bool hasLdsDirect() const
bool hasGWSAutoReplay() const
static unsigned getNumUserSGPRForField(UserSGPRID ID)
void allocKernargPreloadSGPRs(unsigned NumSGPRs)
bool hasPrivateSegmentBuffer() const
unsigned getNumKernargPreloadSGPRs() const
unsigned getNumUsedUserSGPRs() const
GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST)
Itinerary data supplied by a subtarget to be used by a target.
Scheduling dependency.
Definition ScheduleDAG.h:51
const TargetRegisterClass * getBoolRC() const
Scheduling unit. This is a node in the scheduling DAG.
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Provide an instruction scheduling machine model to CodeGen passes.
TargetSubtargetInfo - Generic base class for all target subtargets.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned getSGPRAllocGranule(const MCSubtargetInfo &STI)
unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo &STI)
unsigned getTotalNumSGPRs(const MCSubtargetInfo &STI)
unsigned getAddressableNumSGPRs(const MCSubtargetInfo &STI)
unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo &STI)
unsigned getVGPREncodingGranule(const MCSubtargetInfo &STI, std::optional< bool > EnableWavefrontSize32)
unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo &STI, unsigned FlatWorkGroupSize)
unsigned getMinNumSGPRs(const MCSubtargetInfo &STI, unsigned WavesPerEU)
unsigned getMaxNumSGPRs(const MCSubtargetInfo &STI, unsigned WavesPerEU, bool Addressable)
unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo &STI, unsigned FlatWorkGroupSize)
constexpr unsigned getMaxFlatWorkGroupSize()
unsigned getSGPREncodingGranule(const MCSubtargetInfo &STI)
unsigned getTotalNumVGPRs(const MCSubtargetInfo &STI)
unsigned getMinNumVGPRs(const MCSubtargetInfo &STI, unsigned WavesPerEU, unsigned DynamicVGPRBlockSize)
unsigned getAddressableNumVGPRs(const MCSubtargetInfo &STI, unsigned DynamicVGPRBlockSize)
unsigned getMaxNumVGPRs(const MCSubtargetInfo &STI, unsigned WavesPerEU, unsigned DynamicVGPRBlockSize)
unsigned getVGPRAllocGranule(const MCSubtargetInfo &STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getMinWavesPerEU(const MCSubtargetInfo &STI)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:263
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Define a generic scheduling policy for targets that don't provide their own MachineSchedStrategy.
A region of an MBB for scheduling.