[llvm] [AMDGPU][Draft] OOB mode - module flag (PR #160922)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 26 09:41:27 PDT 2025
github-actions[bot] wrote:
<!--LLVM CODE FORMAT COMMENT: {clang-format}-->
:warning: The C/C++ code formatter, clang-format, found issues in your code. :warning:
<details>
<summary>
You can test this locally with the following command:
</summary>
``````````bash
git-clang-format --diff origin/main HEAD --extensions h,cpp -- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp llvm/lib/Target/AMDGPU/GCNSubtarget.h
``````````
:warning:
The reproduction instructions above might return results for more than one PR
in a stack if you are using a stacked PR workflow. You can limit the results by
changing `origin/main` to the base branch/commit you want to compare against.
:warning:
</details>
<details>
<summary>
View the diff from clang-format here.
</summary>
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index cf50a0c6d..2b5616e70 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -296,9 +296,9 @@ protected:
// Setting a bit enables a relaxed mode that disables strict OOB guarantees;
// an out-of-bounds access may cause a neighboring in-bounds access to be
// treated as OOB.
- // If bit is set, enable relaxed mode. 0 in a bit keeps the corresponding check strict.
- // OOBMode{0} - untyped buffers (buffer_load)
- // OOBMode{1} - typed buffers (tbuffer_load)
+ // If bit is set, enable relaxed mode. 0 in a bit keeps the corresponding
+ // check strict. OOBMode{0} - untyped buffers (buffer_load) OOBMode{1} - typed
+ // buffers (tbuffer_load)
unsigned OOBMode = 0;
private:
@@ -656,1220 +656,1048 @@ public:
return UnalignedAccessMode;
}
- bool hasRelaxedBufferOOBMode() const { return OOBMode == 1; // TODO: Use named const/enum.}
- void setOOBMode(unsigned val) { OOBMode = val; }
+ bool hasRelaxedBufferOOBMode() const {
+ return OOBMode == 1; // TODO: Use named const/enum.}
+ void setOOBMode(unsigned val) { OOBMode = val; }
- bool hasApertureRegs() const {
- return HasApertureRegs;
- }
-
- bool isTrapHandlerEnabled() const {
- return TrapHandler;
- }
-
- bool isXNACKEnabled() const {
- return TargetID.isXnackOnOrAny();
- }
-
- bool isTgSplitEnabled() const {
- return EnableTgSplit;
- }
-
- bool isCuModeEnabled() const {
- return EnableCuMode;
- }
+ bool hasApertureRegs() const { return HasApertureRegs; }
- bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
-
- bool hasFlatAddressSpace() const {
- return FlatAddressSpace;
- }
-
- bool hasFlatScrRegister() const {
- return hasFlatAddressSpace();
- }
-
- bool hasFlatInstOffsets() const {
- return FlatInstOffsets;
- }
-
- bool hasFlatGlobalInsts() const {
- return FlatGlobalInsts;
- }
+ bool isTrapHandlerEnabled() const { return TrapHandler; }
- bool hasFlatScratchInsts() const {
- return FlatScratchInsts;
- }
+ bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
- // Check if target supports ST addressing mode with FLAT scratch instructions.
- // The ST addressing mode means no registers are used, either VGPR or SGPR,
- // but only immediate offset is swizzled and added to the FLAT scratch base.
- bool hasFlatScratchSTMode() const {
- return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
- }
+ bool isTgSplitEnabled() const { return EnableTgSplit; }
- bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
+ bool isCuModeEnabled() const { return EnableCuMode; }
- bool hasScalarFlatScratchInsts() const {
- return ScalarFlatScratchInsts;
- }
+ bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
- bool enableFlatScratch() const {
- return flatScratchIsArchitected() ||
- (EnableFlatScratch && hasFlatScratchInsts());
- }
+ bool hasFlatAddressSpace() const { return FlatAddressSpace; }
- bool hasGlobalAddTidInsts() const {
- return GFX10_BEncoding;
- }
+ bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }
- bool hasAtomicCSub() const {
- return GFX10_BEncoding;
- }
+ bool hasFlatInstOffsets() const { return FlatInstOffsets; }
- bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }
+ bool hasFlatGlobalInsts() const { return FlatGlobalInsts; }
- bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }
+ bool hasFlatScratchInsts() const { return FlatScratchInsts; }
- bool hasExportInsts() const {
- return !hasGFX940Insts() && !hasGFX1250Insts();
- }
+ // Check if target supports ST addressing mode with FLAT scratch
+ // instructions. The ST addressing mode means no registers are used, either
+ // VGPR or SGPR, but only immediate offset is swizzled and added to the FLAT
+ // scratch base.
+ bool hasFlatScratchSTMode() const {
+ return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
+ }
- bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); }
+ bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
- // DS_ADD_F64/DS_ADD_RTN_F64
- bool hasLdsAtomicAddF64() const {
- return hasGFX90AInsts() || hasGFX1250Insts();
- }
+ bool hasScalarFlatScratchInsts() const { return ScalarFlatScratchInsts; }
- bool hasMultiDwordFlatScratchAddressing() const {
- return getGeneration() >= GFX9;
- }
+ bool enableFlatScratch() const {
+ return flatScratchIsArchitected() ||
+ (EnableFlatScratch && hasFlatScratchInsts());
+ }
- bool hasFlatSegmentOffsetBug() const {
- return HasFlatSegmentOffsetBug;
- }
+ bool hasGlobalAddTidInsts() const { return GFX10_BEncoding; }
- bool hasFlatLgkmVMemCountInOrder() const {
- return getGeneration() > GFX9;
- }
+ bool hasAtomicCSub() const { return GFX10_BEncoding; }
- bool hasD16LoadStore() const {
- return getGeneration() >= GFX9;
- }
+ bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }
- bool d16PreservesUnusedBits() const {
- return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
- }
+ bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }
- bool hasD16Images() const {
- return getGeneration() >= VOLCANIC_ISLANDS;
- }
+ bool hasExportInsts() const {
+ return !hasGFX940Insts() && !hasGFX1250Insts();
+ }
- /// Return if most LDS instructions have an m0 use that require m0 to be
- /// initialized.
- bool ldsRequiresM0Init() const {
- return getGeneration() < GFX9;
- }
+ bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); }
- // True if the hardware rewinds and replays GWS operations if a wave is
- // preempted.
- //
- // If this is false, a GWS operation requires testing if a nack set the
- // MEM_VIOL bit, and repeating if so.
- bool hasGWSAutoReplay() const {
- return getGeneration() >= GFX9;
- }
-
- /// \returns if target has ds_gws_sema_release_all instruction.
- bool hasGWSSemaReleaseAll() const {
- return CIInsts;
- }
-
- /// \returns true if the target has integer add/sub instructions that do not
- /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
- /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
- /// for saturation.
- bool hasAddNoCarry() const {
- return AddNoCarryInsts;
- }
+ // DS_ADD_F64/DS_ADD_RTN_F64
+ bool hasLdsAtomicAddF64() const {
+ return hasGFX90AInsts() || hasGFX1250Insts();
+ }
- bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
+ bool hasMultiDwordFlatScratchAddressing() const {
+ return getGeneration() >= GFX9;
+ }
- bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
+ bool hasFlatSegmentOffsetBug() const { return HasFlatSegmentOffsetBug; }
- bool hasUnpackedD16VMem() const {
- return HasUnpackedD16VMem;
- }
+ bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }
- // Covers VS/PS/CS graphics shaders
- bool isMesaGfxShader(const Function &F) const {
- return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
- }
+ bool hasD16LoadStore() const { return getGeneration() >= GFX9; }
- bool hasMad64_32() const {
- return getGeneration() >= SEA_ISLANDS;
- }
+ bool d16PreservesUnusedBits() const {
+ return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
+ }
- bool hasSDWAOmod() const {
- return HasSDWAOmod;
- }
+ bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }
- bool hasSDWAScalar() const {
- return HasSDWAScalar;
- }
+ /// Return if most LDS instructions have an m0 use that require m0 to be
+ /// initialized.
+ bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }
- bool hasSDWASdst() const {
- return HasSDWASdst;
- }
+ // True if the hardware rewinds and replays GWS operations if a wave is
+ // preempted.
+ //
+ // If this is false, a GWS operation requires testing if a nack set the
+ // MEM_VIOL bit, and repeating if so.
+ bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }
- bool hasSDWAMac() const {
- return HasSDWAMac;
- }
+ /// \returns if target has ds_gws_sema_release_all instruction.
+ bool hasGWSSemaReleaseAll() const { return CIInsts; }
- bool hasSDWAOutModsVOPC() const {
- return HasSDWAOutModsVOPC;
- }
+ /// \returns true if the target has integer add/sub instructions that do not
+ /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
+ /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
+ /// for saturation.
+ bool hasAddNoCarry() const { return AddNoCarryInsts; }
- bool hasDLInsts() const {
- return HasDLInsts;
- }
+ bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
- bool hasFmacF64Inst() const { return HasFmacF64Inst; }
+ bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
- bool hasDot1Insts() const {
- return HasDot1Insts;
- }
+ bool hasUnpackedD16VMem() const { return HasUnpackedD16VMem; }
- bool hasDot2Insts() const {
- return HasDot2Insts;
- }
+ // Covers VS/PS/CS graphics shaders
+ bool isMesaGfxShader(const Function &F) const {
+ return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
+ }
- bool hasDot3Insts() const {
- return HasDot3Insts;
- }
+ bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
- bool hasDot4Insts() const {
- return HasDot4Insts;
- }
+ bool hasSDWAOmod() const { return HasSDWAOmod; }
- bool hasDot5Insts() const {
- return HasDot5Insts;
- }
+ bool hasSDWAScalar() const { return HasSDWAScalar; }
- bool hasDot6Insts() const {
- return HasDot6Insts;
- }
+ bool hasSDWASdst() const { return HasSDWASdst; }
- bool hasDot7Insts() const {
- return HasDot7Insts;
- }
+ bool hasSDWAMac() const { return HasSDWAMac; }
- bool hasDot8Insts() const {
- return HasDot8Insts;
- }
+ bool hasSDWAOutModsVOPC() const { return HasSDWAOutModsVOPC; }
- bool hasDot9Insts() const {
- return HasDot9Insts;
- }
+ bool hasDLInsts() const { return HasDLInsts; }
- bool hasDot10Insts() const {
- return HasDot10Insts;
- }
+ bool hasFmacF64Inst() const { return HasFmacF64Inst; }
- bool hasDot11Insts() const {
- return HasDot11Insts;
- }
+ bool hasDot1Insts() const { return HasDot1Insts; }
- bool hasDot12Insts() const {
- return HasDot12Insts;
- }
+ bool hasDot2Insts() const { return HasDot2Insts; }
- bool hasDot13Insts() const {
- return HasDot13Insts;
- }
+ bool hasDot3Insts() const { return HasDot3Insts; }
- bool hasMAIInsts() const {
- return HasMAIInsts;
- }
+ bool hasDot4Insts() const { return HasDot4Insts; }
- bool hasFP8Insts() const {
- return HasFP8Insts;
- }
+ bool hasDot5Insts() const { return HasDot5Insts; }
- bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
+ bool hasDot6Insts() const { return HasDot6Insts; }
- bool hasFP8E5M3Insts() const { return HasFP8E5M3Insts; }
+ bool hasDot7Insts() const { return HasDot7Insts; }
- bool hasPkFmacF16Inst() const {
- return HasPkFmacF16Inst;
- }
+ bool hasDot8Insts() const { return HasDot8Insts; }
- bool hasAtomicFMinFMaxF32GlobalInsts() const {
- return HasAtomicFMinFMaxF32GlobalInsts;
- }
+ bool hasDot9Insts() const { return HasDot9Insts; }
- bool hasAtomicFMinFMaxF64GlobalInsts() const {
- return HasAtomicFMinFMaxF64GlobalInsts;
- }
+ bool hasDot10Insts() const { return HasDot10Insts; }
- bool hasAtomicFMinFMaxF32FlatInsts() const {
- return HasAtomicFMinFMaxF32FlatInsts;
- }
+ bool hasDot11Insts() const { return HasDot11Insts; }
- bool hasAtomicFMinFMaxF64FlatInsts() const {
- return HasAtomicFMinFMaxF64FlatInsts;
- }
+ bool hasDot12Insts() const { return HasDot12Insts; }
- bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
+ bool hasDot13Insts() const { return HasDot13Insts; }
- bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
+ bool hasMAIInsts() const { return HasMAIInsts; }
- bool hasAtomicFaddInsts() const {
- return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
- }
+ bool hasFP8Insts() const { return HasFP8Insts; }
- bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
+ bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
- bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
+ bool hasFP8E5M3Insts() const { return HasFP8E5M3Insts; }
- bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
- return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
- }
+ bool hasPkFmacF16Inst() const { return HasPkFmacF16Inst; }
- bool hasAtomicBufferGlobalPkAddF16Insts() const {
- return HasAtomicBufferGlobalPkAddF16Insts;
- }
+ bool hasAtomicFMinFMaxF32GlobalInsts() const {
+ return HasAtomicFMinFMaxF32GlobalInsts;
+ }
- bool hasAtomicGlobalPkAddBF16Inst() const {
- return HasAtomicGlobalPkAddBF16Inst;
- }
+ bool hasAtomicFMinFMaxF64GlobalInsts() const {
+ return HasAtomicFMinFMaxF64GlobalInsts;
+ }
- bool hasAtomicBufferPkAddBF16Inst() const {
- return HasAtomicBufferPkAddBF16Inst;
- }
+ bool hasAtomicFMinFMaxF32FlatInsts() const {
+ return HasAtomicFMinFMaxF32FlatInsts;
+ }
- bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
+ bool hasAtomicFMinFMaxF64FlatInsts() const {
+ return HasAtomicFMinFMaxF64FlatInsts;
+ }
- /// \return true if the target has flat, global, and buffer atomic fadd for
- /// double.
- bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
- return HasFlatBufferGlobalAtomicFaddF64Inst;
- }
+ bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
- /// \return true if the target's flat, global, and buffer atomic fadd for
- /// float supports denormal handling.
- bool hasMemoryAtomicFaddF32DenormalSupport() const {
- return HasMemoryAtomicFaddF32DenormalSupport;
- }
+ bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
- /// \return true if atomic operations targeting fine-grained memory work
- /// correctly at device scope, in allocations in host or peer PCIe device
- /// memory.
- bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
- return HasAgentScopeFineGrainedRemoteMemoryAtomics;
- }
+ bool hasAtomicFaddInsts() const {
+ return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
+ }
- /// \return true is HW emulates system scope atomics unsupported by the PCI-e
- /// via CAS loop.
- bool hasEmulatedSystemScopeAtomics() const {
- return HasEmulatedSystemScopeAtomics;
- }
+ bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
- bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
+ bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
- bool hasDefaultComponentBroadcast() const {
- return HasDefaultComponentBroadcast;
- }
+ bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
+ return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
+ }
- bool hasNoSdstCMPX() const {
- return HasNoSdstCMPX;
- }
+ bool hasAtomicBufferGlobalPkAddF16Insts() const {
+ return HasAtomicBufferGlobalPkAddF16Insts;
+ }
- bool hasVscnt() const {
- return HasVscnt;
- }
+ bool hasAtomicGlobalPkAddBF16Inst() const {
+ return HasAtomicGlobalPkAddBF16Inst;
+ }
- bool hasGetWaveIdInst() const {
- return HasGetWaveIdInst;
- }
+ bool hasAtomicBufferPkAddBF16Inst() const {
+ return HasAtomicBufferPkAddBF16Inst;
+ }
- bool hasSMemTimeInst() const {
- return HasSMemTimeInst;
- }
+ bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
- bool hasShaderCyclesRegister() const {
- return HasShaderCyclesRegister;
- }
+ /// \return true if the target has flat, global, and buffer atomic fadd for
+ /// double.
+ bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
+ return HasFlatBufferGlobalAtomicFaddF64Inst;
+ }
- bool hasShaderCyclesHiLoRegisters() const {
- return HasShaderCyclesHiLoRegisters;
- }
+ /// \return true if the target's flat, global, and buffer atomic fadd for
+ /// float supports denormal handling.
+ bool hasMemoryAtomicFaddF32DenormalSupport() const {
+ return HasMemoryAtomicFaddF32DenormalSupport;
+ }
- bool hasVOP3Literal() const {
- return HasVOP3Literal;
- }
+ /// \return true if atomic operations targeting fine-grained memory work
+ /// correctly at device scope, in allocations in host or peer PCIe device
+ /// memory.
+ bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
+ return HasAgentScopeFineGrainedRemoteMemoryAtomics;
+ }
- bool hasNoDataDepHazard() const {
- return HasNoDataDepHazard;
- }
+ /// \return true is HW emulates system scope atomics unsupported by the
+ /// PCI-e via CAS loop.
+ bool hasEmulatedSystemScopeAtomics() const {
+ return HasEmulatedSystemScopeAtomics;
+ }
- bool vmemWriteNeedsExpWaitcnt() const {
- return getGeneration() < SEA_ISLANDS;
- }
+ bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
- bool hasInstPrefetch() const {
- return getGeneration() == GFX10 || getGeneration() == GFX11;
- }
+ bool hasDefaultComponentBroadcast() const {
+ return HasDefaultComponentBroadcast;
+ }
- bool hasPrefetch() const { return GFX12Insts; }
+ bool hasNoSdstCMPX() const { return HasNoSdstCMPX; }
- bool hasVmemPrefInsts() const { return HasVmemPrefInsts; }
+ bool hasVscnt() const { return HasVscnt; }
- bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
+ bool hasGetWaveIdInst() const { return HasGetWaveIdInst; }
- bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
+ bool hasSMemTimeInst() const { return HasSMemTimeInst; }
- // Has s_cmpk_* instructions.
- bool hasSCmpK() const { return getGeneration() < GFX12; }
+ bool hasShaderCyclesRegister() const { return HasShaderCyclesRegister; }
- // Scratch is allocated in 256 dword per wave blocks for the entire
- // wavefront. When viewed from the perspective of an arbitrary workitem, this
- // is 4-byte aligned.
- //
- // Only 4-byte alignment is really needed to access anything. Transformations
- // on the pointer value itself may rely on the alignment / known low bits of
- // the pointer. Set this to something above the minimum to avoid needing
- // dynamic realignment in common cases.
- Align getStackAlignment() const { return Align(16); }
+ bool hasShaderCyclesHiLoRegisters() const {
+ return HasShaderCyclesHiLoRegisters;
+ }
- bool enableMachineScheduler() const override {
- return true;
- }
+ bool hasVOP3Literal() const { return HasVOP3Literal; }
- bool useAA() const override;
+ bool hasNoDataDepHazard() const { return HasNoDataDepHazard; }
- bool enableSubRegLiveness() const override {
- return true;
- }
+ bool vmemWriteNeedsExpWaitcnt() const {
+ return getGeneration() < SEA_ISLANDS;
+ }
- void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
- bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
+ bool hasInstPrefetch() const {
+ return getGeneration() == GFX10 || getGeneration() == GFX11;
+ }
- // static wrappers
- static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
+ bool hasPrefetch() const { return GFX12Insts; }
- // XXX - Why is this here if it isn't in the default pass set?
- bool enableEarlyIfConversion() const override {
- return true;
- }
+ bool hasVmemPrefInsts() const { return HasVmemPrefInsts; }
- void overrideSchedPolicy(MachineSchedPolicy &Policy,
- const SchedRegion &Region) const override;
+ bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
- void overridePostRASchedPolicy(MachineSchedPolicy &Policy,
- const SchedRegion &Region) const override;
+ bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
- void mirFileLoaded(MachineFunction &MF) const override;
+ // Has s_cmpk_* instructions.
+ bool hasSCmpK() const { return getGeneration() < GFX12; }
- unsigned getMaxNumUserSGPRs() const {
- return AMDGPU::getMaxNumUserSGPRs(*this);
- }
+ // Scratch is allocated in 256 dword per wave blocks for the entire
+ // wavefront. When viewed from the perspective of an arbitrary workitem,
+ // this is 4-byte aligned.
+ //
+ // Only 4-byte alignment is really needed to access anything.
+ // Transformations on the pointer value itself may rely on the alignment /
+ // known low bits of the pointer. Set this to something above the minimum to
+ // avoid needing dynamic realignment in common cases.
+ Align getStackAlignment() const { return Align(16); }
- bool hasSMemRealTime() const {
- return HasSMemRealTime;
- }
+ bool enableMachineScheduler() const override { return true; }
- bool hasMovrel() const {
- return HasMovrel;
- }
+ bool useAA() const override;
- bool hasVGPRIndexMode() const {
- return HasVGPRIndexMode;
- }
+ bool enableSubRegLiveness() const override { return true; }
- bool useVGPRIndexMode() const;
+ void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
+ bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
- bool hasScalarCompareEq64() const {
- return getGeneration() >= VOLCANIC_ISLANDS;
- }
+ // static wrappers
+ static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
- bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
+ // XXX - Why is this here if it isn't in the default pass set?
+ bool enableEarlyIfConversion() const override { return true; }
- bool hasScalarStores() const {
- return HasScalarStores;
- }
+ void overrideSchedPolicy(MachineSchedPolicy & Policy,
+ const SchedRegion &Region) const override;
- bool hasScalarAtomics() const {
- return HasScalarAtomics;
- }
+ void overridePostRASchedPolicy(MachineSchedPolicy & Policy,
+ const SchedRegion &Region) const override;
- bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
- bool hasLDSFPAtomicAddF64() const { return GFX90AInsts || GFX1250Insts; }
+ void mirFileLoaded(MachineFunction & MF) const override;
- /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
- bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
+ unsigned getMaxNumUserSGPRs() const {
+ return AMDGPU::getMaxNumUserSGPRs(*this);
+ }
- /// \returns true if the subtarget has the v_permlane64_b32 instruction.
- bool hasPermLane64() const { return getGeneration() >= GFX11; }
+ bool hasSMemRealTime() const { return HasSMemRealTime; }
- bool hasDPP() const {
- return HasDPP;
- }
+ bool hasMovrel() const { return HasMovrel; }
- bool hasDPPBroadcasts() const {
- return HasDPP && getGeneration() < GFX10;
- }
+ bool hasVGPRIndexMode() const { return HasVGPRIndexMode; }
- bool hasDPPWavefrontShifts() const {
- return HasDPP && getGeneration() < GFX10;
- }
+ bool useVGPRIndexMode() const;
- bool hasDPP8() const {
- return HasDPP8;
- }
+ bool hasScalarCompareEq64() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
- bool hasDPALU_DPP() const {
- return HasDPALU_DPP;
- }
+ bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
- bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
+ bool hasScalarStores() const { return HasScalarStores; }
- bool hasPackedFP32Ops() const {
- return HasPackedFP32Ops;
- }
+ bool hasScalarAtomics() const { return HasScalarAtomics; }
- // Has V_PK_MOV_B32 opcode
- bool hasPkMovB32() const {
- return GFX90AInsts;
- }
+ bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
+ bool hasLDSFPAtomicAddF64() const { return GFX90AInsts || GFX1250Insts; }
- bool hasFmaakFmamkF32Insts() const {
- return getGeneration() >= GFX10 || hasGFX940Insts();
- }
+ /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
+ bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
- bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }
+ /// \returns true if the subtarget has the v_permlane64_b32 instruction.
+ bool hasPermLane64() const { return getGeneration() >= GFX11; }
- bool hasImageInsts() const {
- return HasImageInsts;
- }
+ bool hasDPP() const { return HasDPP; }
- bool hasExtendedImageInsts() const {
- return HasExtendedImageInsts;
- }
+ bool hasDPPBroadcasts() const { return HasDPP && getGeneration() < GFX10; }
- bool hasR128A16() const {
- return HasR128A16;
- }
+ bool hasDPPWavefrontShifts() const {
+ return HasDPP && getGeneration() < GFX10;
+ }
- bool hasA16() const { return HasA16; }
+ bool hasDPP8() const { return HasDPP8; }
- bool hasG16() const { return HasG16; }
+ bool hasDPALU_DPP() const { return HasDPALU_DPP; }
- bool hasOffset3fBug() const {
- return HasOffset3fBug;
- }
+ bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
- bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
+ bool hasPackedFP32Ops() const { return HasPackedFP32Ops; }
- bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
+ // Has V_PK_MOV_B32 opcode
+ bool hasPkMovB32() const { return GFX90AInsts; }
- bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
+ bool hasFmaakFmamkF32Insts() const {
+ return getGeneration() >= GFX10 || hasGFX940Insts();
+ }
- bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
+ bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }
- bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
+ bool hasImageInsts() const { return HasImageInsts; }
- bool hasNSAEncoding() const { return HasNSAEncoding; }
+ bool hasExtendedImageInsts() const { return HasExtendedImageInsts; }
- bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
+ bool hasR128A16() const { return HasR128A16; }
- bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
+ bool hasA16() const { return HasA16; }
- unsigned getNSAMaxSize(bool HasSampler = false) const {
- return AMDGPU::getNSAMaxSize(*this, HasSampler);
- }
+ bool hasG16() const { return HasG16; }
- bool hasGFX10_AEncoding() const {
- return GFX10_AEncoding;
- }
+ bool hasOffset3fBug() const { return HasOffset3fBug; }
- bool hasGFX10_BEncoding() const {
- return GFX10_BEncoding;
- }
+ bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
- bool hasGFX10_3Insts() const {
- return GFX10_3Insts;
- }
+ bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
- bool hasMadF16() const;
+ bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
- bool hasMovB64() const { return GFX940Insts || GFX1250Insts; }
+ bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
- bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }
+ bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
- // Scalar and global loads support scale_offset bit.
- bool hasScaleOffset() const { return GFX1250Insts; }
+ bool hasNSAEncoding() const { return HasNSAEncoding; }
- bool hasFlatGVSMode() const { return FlatGVSMode; }
+ bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
- // FLAT GLOBAL VOffset is signed
- bool hasSignedGVSOffset() const { return GFX1250Insts; }
+ bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
- bool enableSIScheduler() const {
- return EnableSIScheduler;
- }
+ unsigned getNSAMaxSize(bool HasSampler = false) const {
+ return AMDGPU::getNSAMaxSize(*this, HasSampler);
+ }
- bool loadStoreOptEnabled() const {
- return EnableLoadStoreOpt;
- }
+ bool hasGFX10_AEncoding() const { return GFX10_AEncoding; }
- bool hasSGPRInitBug() const {
- return SGPRInitBug;
- }
+ bool hasGFX10_BEncoding() const { return GFX10_BEncoding; }
- bool hasUserSGPRInit16Bug() const {
- return UserSGPRInit16Bug && isWave32();
- }
+ bool hasGFX10_3Insts() const { return GFX10_3Insts; }
- bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
+ bool hasMadF16() const;
- bool hasNegativeUnalignedScratchOffsetBug() const {
- return NegativeUnalignedScratchOffsetBug;
- }
+ bool hasMovB64() const { return GFX940Insts || GFX1250Insts; }
- bool hasMFMAInlineLiteralBug() const {
- return HasMFMAInlineLiteralBug;
- }
+ bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }
- bool has12DWordStoreHazard() const {
- return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
- }
+ // Scalar and global loads support scale_offset bit.
+ bool hasScaleOffset() const { return GFX1250Insts; }
- // \returns true if the subtarget supports DWORDX3 load/store instructions.
- bool hasDwordx3LoadStores() const {
- return CIInsts;
- }
+ bool hasFlatGVSMode() const { return FlatGVSMode; }
- bool hasReadM0MovRelInterpHazard() const {
- return getGeneration() == AMDGPUSubtarget::GFX9;
- }
+ // FLAT GLOBAL VOffset is signed
+ bool hasSignedGVSOffset() const { return GFX1250Insts; }
- bool hasReadM0SendMsgHazard() const {
- return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
- getGeneration() <= AMDGPUSubtarget::GFX9;
- }
+ bool enableSIScheduler() const { return EnableSIScheduler; }
- bool hasReadM0LdsDmaHazard() const {
- return getGeneration() == AMDGPUSubtarget::GFX9;
- }
+ bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; }
- bool hasReadM0LdsDirectHazard() const {
- return getGeneration() == AMDGPUSubtarget::GFX9;
- }
+ bool hasSGPRInitBug() const { return SGPRInitBug; }
- bool hasVcmpxPermlaneHazard() const {
- return HasVcmpxPermlaneHazard;
- }
+ bool hasUserSGPRInit16Bug() const {
+ return UserSGPRInit16Bug && isWave32();
+ }
- bool hasVMEMtoScalarWriteHazard() const {
- return HasVMEMtoScalarWriteHazard;
- }
+ bool hasNegativeScratchOffsetBug() const {
+ return NegativeScratchOffsetBug;
+ }
- bool hasSMEMtoVectorWriteHazard() const {
- return HasSMEMtoVectorWriteHazard;
- }
+ bool hasNegativeUnalignedScratchOffsetBug() const {
+ return NegativeUnalignedScratchOffsetBug;
+ }
- bool hasLDSMisalignedBug() const {
- return LDSMisalignedBug && !EnableCuMode;
- }
+ bool hasMFMAInlineLiteralBug() const { return HasMFMAInlineLiteralBug; }
- bool hasInstFwdPrefetchBug() const {
- return HasInstFwdPrefetchBug;
- }
+ bool has12DWordStoreHazard() const {
+ return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
+ }
- bool hasVcmpxExecWARHazard() const {
- return HasVcmpxExecWARHazard;
- }
+ // \returns true if the subtarget supports DWORDX3 load/store instructions.
+ bool hasDwordx3LoadStores() const { return CIInsts; }
- bool hasLdsBranchVmemWARHazard() const {
- return HasLdsBranchVmemWARHazard;
- }
+ bool hasReadM0MovRelInterpHazard() const {
+ return getGeneration() == AMDGPUSubtarget::GFX9;
+ }
- // Shift amount of a 64 bit shift cannot be a highest allocated register
- // if also at the end of the allocation block.
- bool hasShift64HighRegBug() const {
- return GFX90AInsts && !GFX940Insts;
- }
+ bool hasReadM0SendMsgHazard() const {
+ return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ getGeneration() <= AMDGPUSubtarget::GFX9;
+ }
- // Has one cycle hazard on transcendental instruction feeding a
- // non transcendental VALU.
- bool hasTransForwardingHazard() const { return GFX940Insts; }
+ bool hasReadM0LdsDmaHazard() const {
+ return getGeneration() == AMDGPUSubtarget::GFX9;
+ }
- // Has one cycle hazard on a VALU instruction partially writing dst with
- // a shift of result bits feeding another VALU instruction.
- bool hasDstSelForwardingHazard() const { return GFX940Insts; }
+ bool hasReadM0LdsDirectHazard() const {
+ return getGeneration() == AMDGPUSubtarget::GFX9;
+ }
- // Cannot use op_sel with v_dot instructions.
- bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
+ bool hasVcmpxPermlaneHazard() const { return HasVcmpxPermlaneHazard; }
- // Does not have HW interlocs for VALU writing and then reading SGPRs.
- bool hasVDecCoExecHazard() const {
- return GFX940Insts;
- }
+ bool hasVMEMtoScalarWriteHazard() const {
+ return HasVMEMtoScalarWriteHazard;
+ }
- bool hasNSAtoVMEMBug() const {
- return HasNSAtoVMEMBug;
- }
+ bool hasSMEMtoVectorWriteHazard() const {
+ return HasSMEMtoVectorWriteHazard;
+ }
- bool hasNSAClauseBug() const { return HasNSAClauseBug; }
+ bool hasLDSMisalignedBug() const {
+ return LDSMisalignedBug && !EnableCuMode;
+ }
- bool hasHardClauses() const { return MaxHardClauseLength > 0; }
+ bool hasInstFwdPrefetchBug() const { return HasInstFwdPrefetchBug; }
- bool hasGFX90AInsts() const { return GFX90AInsts; }
+ bool hasVcmpxExecWARHazard() const { return HasVcmpxExecWARHazard; }
- bool hasFPAtomicToDenormModeHazard() const {
- return getGeneration() == GFX10;
- }
+ bool hasLdsBranchVmemWARHazard() const { return HasLdsBranchVmemWARHazard; }
- bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
+ // Shift amount of a 64 bit shift cannot be a highest allocated register
+ // if also at the end of the allocation block.
+ bool hasShift64HighRegBug() const { return GFX90AInsts && !GFX940Insts; }
- bool hasLdsDirect() const { return getGeneration() >= GFX11; }
+ // Has one cycle hazard on transcendental instruction feeding a
+ // non-transcendental VALU.
+ bool hasTransForwardingHazard() const { return GFX940Insts; }
- bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
+ // Has one cycle hazard on a VALU instruction partially writing dst with
+ // a shift of result bits feeding another VALU instruction.
+ bool hasDstSelForwardingHazard() const { return GFX940Insts; }
- bool hasVALUPartialForwardingHazard() const {
- return getGeneration() == GFX11;
- }
+ // Cannot use op_sel with v_dot instructions.
+ bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
- bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
+ // Does not have HW interlocks for VALU writing and then reading SGPRs.
+ bool hasVDecCoExecHazard() const { return GFX940Insts; }
- bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }
+ bool hasNSAtoVMEMBug() const { return HasNSAtoVMEMBug; }
- bool requiresCodeObjectV6() const { return RequiresCOV6; }
+ bool hasNSAClauseBug() const { return HasNSAClauseBug; }
- bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
+ bool hasHardClauses() const { return MaxHardClauseLength > 0; }
- bool hasGloballyAddressableScratch() const {
- return HasGloballyAddressableScratch;
- }
+ bool hasGFX90AInsts() const { return GFX90AInsts; }
- bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
+ bool hasFPAtomicToDenormModeHazard() const {
+ return getGeneration() == GFX10;
+ }
- bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }
+ bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
- bool setRegModeNeedsVNOPs() const {
- return GFX1250Insts && getGeneration() == GFX12;
- }
+ bool hasLdsDirect() const { return getGeneration() >= GFX11; }
- /// Return if operations acting on VGPR tuples require even alignment.
- bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
+ bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
- /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
- bool hasSPackHL() const { return GFX11Insts; }
+ bool hasVALUPartialForwardingHazard() const {
+ return getGeneration() == GFX11;
+ }
- /// Return true if the target's EXP instruction has the COMPR flag, which
- /// affects the meaning of the EN (enable) bits.
- bool hasCompressedExport() const { return !GFX11Insts; }
+ bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
- /// Return true if the target's EXP instruction supports the NULL export
- /// target.
- bool hasNullExportTarget() const { return !GFX11Insts; }
+ bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }
- bool has1_5xVGPRs() const { return Has1_5xVGPRs; }
+ bool requiresCodeObjectV6() const { return RequiresCOV6; }
- bool hasVOPDInsts() const { return HasVOPDInsts; }
+ bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
- bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
+ bool hasGloballyAddressableScratch() const {
+ return HasGloballyAddressableScratch;
+ }
- /// Return true if the target has the S_DELAY_ALU instruction.
- bool hasDelayAlu() const { return GFX11Insts; }
+ bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
- bool hasPackedTID() const { return HasPackedTID; }
+ bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }
- // GFX94* is a derivation to GFX90A. hasGFX940Insts() being true implies that
- // hasGFX90AInsts is also true.
- bool hasGFX940Insts() const { return GFX940Insts; }
+ bool setRegModeNeedsVNOPs() const {
+ return GFX1250Insts && getGeneration() == GFX12;
+ }
- // GFX950 is a derivation to GFX94*. hasGFX950Insts() implies that
- // hasGFX940Insts and hasGFX90AInsts are also true.
- bool hasGFX950Insts() const { return GFX950Insts; }
+ /// Return if operations acting on VGPR tuples require even alignment.
+ bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
- /// Returns true if the target supports
- /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
- /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
- bool hasLDSLoadB96_B128() const {
- return hasGFX950Insts();
- }
+ /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
+ bool hasSPackHL() const { return GFX11Insts; }
- bool hasVMemToLDSLoad() const { return HasVMemToLDSLoad; }
+ /// Return true if the target's EXP instruction has the COMPR flag, which
+ /// affects the meaning of the EN (enable) bits.
+ bool hasCompressedExport() const { return !GFX11Insts; }
- bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
+ /// Return true if the target's EXP instruction supports the NULL export
+ /// target.
+ bool hasNullExportTarget() const { return !GFX11Insts; }
- bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
+ bool has1_5xVGPRs() const { return Has1_5xVGPRs; }
- bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
+ bool hasVOPDInsts() const { return HasVOPDInsts; }
- bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
+ bool hasFlatScratchSVSSwizzleBug() const {
+ return getGeneration() == GFX11;
+ }
- bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
+ /// Return true if the target has the S_DELAY_ALU instruction.
+ bool hasDelayAlu() const { return GFX11Insts; }
- /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
- /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
- bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
+ bool hasPackedTID() const { return HasPackedTID; }
- /// \returns true if inline constants are not supported for F16 pseudo
- /// scalar transcendentals.
- bool hasNoF16PseudoScalarTransInlineConstants() const {
- return getGeneration() == GFX12;
- }
+ // GFX94* is a derivation to GFX90A. hasGFX940Insts() being true implies
+ // that hasGFX90AInsts is also true.
+ bool hasGFX940Insts() const { return GFX940Insts; }
- /// \returns true if the target has instructions with xf32 format support.
- bool hasXF32Insts() const { return HasXF32Insts; }
+ // GFX950 is a derivation to GFX94*. hasGFX950Insts() implies that
+ // hasGFX940Insts and hasGFX90AInsts are also true.
+ bool hasGFX950Insts() const { return GFX950Insts; }
- bool hasBitOp3Insts() const { return HasBitOp3Insts; }
+ /// Returns true if the target supports
+ /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
+ /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
+ bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }
- bool hasPermlane16Swap() const { return HasPermlane16Swap; }
- bool hasPermlane32Swap() const { return HasPermlane32Swap; }
- bool hasAshrPkInsts() const { return HasAshrPkInsts; }
+ bool hasVMemToLDSLoad() const { return HasVMemToLDSLoad; }
- bool hasMinimum3Maximum3F32() const {
- return HasMinimum3Maximum3F32;
- }
+ bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
- bool hasMinimum3Maximum3F16() const {
- return HasMinimum3Maximum3F16;
- }
+ bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
- bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; }
+ bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
- bool hasTanhInsts() const { return HasTanhInsts; }
+ bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
- bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; }
+ bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
- bool hasAddPC64Inst() const { return GFX1250Insts; }
+ /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
+ /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
+ bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
- bool has1024AddressableVGPRs() const { return Has1024AddressableVGPRs; }
+ /// \returns true if inline constants are not supported for F16 pseudo
+ /// scalar transcendentals.
+ bool hasNoF16PseudoScalarTransInlineConstants() const {
+ return getGeneration() == GFX12;
+ }
- bool hasMinimum3Maximum3PKF16() const {
- return HasMinimum3Maximum3PKF16;
- }
+ /// \returns true if the target has instructions with xf32 format support.
+ bool hasXF32Insts() const { return HasXF32Insts; }
- bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; }
+ bool hasBitOp3Insts() const { return HasBitOp3Insts; }
- /// \returns true if the target has s_wait_xcnt insertion. Supported for
- /// GFX1250.
- bool hasWaitXCnt() const { return HasWaitXcnt; }
+ bool hasPermlane16Swap() const { return HasPermlane16Swap; }
+ bool hasPermlane32Swap() const { return HasPermlane32Swap; }
+ bool hasAshrPkInsts() const { return HasAshrPkInsts; }
- // A single DWORD instruction can use a 64-bit literal.
- bool has64BitLiterals() const { return Has64BitLiterals; }
+ bool hasMinimum3Maximum3F32() const { return HasMinimum3Maximum3F32; }
- bool hasPointSampleAccel() const { return HasPointSampleAccel; }
+ bool hasMinimum3Maximum3F16() const { return HasMinimum3Maximum3F16; }
- bool hasLdsBarrierArriveAtomic() const { return HasLdsBarrierArriveAtomic; }
+ bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; }
- /// \returns The maximum number of instructions that can be enclosed in an
- /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
- /// instruction.
- unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
+ bool hasTanhInsts() const { return HasTanhInsts; }
- bool hasPrngInst() const { return HasPrngInst; }
+ bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; }
- bool hasBVHDualAndBVH8Insts() const { return HasBVHDualAndBVH8Insts; }
+ bool hasAddPC64Inst() const { return GFX1250Insts; }
- /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
- /// SGPRs
- unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
+ bool has1024AddressableVGPRs() const { return Has1024AddressableVGPRs; }
- /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
- /// VGPRs
- unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
- unsigned DynamicVGPRBlockSize) const;
+ bool hasMinimum3Maximum3PKF16() const { return HasMinimum3Maximum3PKF16; }
- /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
- /// be achieved when the only function running on a CU is \p F, each workgroup
- /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
- /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
- /// range, so this returns a range as well.
- ///
- /// Note that occupancy can be affected by the scratch allocation as well, but
- /// we do not have enough information to compute it.
- std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
- unsigned LDSSize = 0,
- unsigned NumSGPRs = 0,
- unsigned NumVGPRs = 0) const;
+ bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; }
- /// \returns true if the flat_scratch register should be initialized with the
- /// pointer to the wave's scratch memory rather than a size and offset.
- bool flatScratchIsPointer() const {
- return getGeneration() >= AMDGPUSubtarget::GFX9;
- }
+ /// \returns true if the target has s_wait_xcnt insertion. Supported for
+ /// GFX1250.
+ bool hasWaitXCnt() const { return HasWaitXcnt; }
- /// \returns true if the flat_scratch register is initialized by the HW.
- /// In this case it is readonly.
- bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
+ // A single DWORD instruction can use a 64-bit literal.
+ bool has64BitLiterals() const { return Has64BitLiterals; }
- /// \returns true if the architected SGPRs are enabled.
- bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
+ bool hasPointSampleAccel() const { return HasPointSampleAccel; }
- /// \returns true if Global Data Share is supported.
- bool hasGDS() const { return HasGDS; }
+ bool hasLdsBarrierArriveAtomic() const { return HasLdsBarrierArriveAtomic; }
- /// \returns true if Global Wave Sync is supported.
- bool hasGWS() const { return HasGWS; }
+ /// \returns The maximum number of instructions that can be enclosed in an
+ /// S_CLAUSE on the given subtarget, or 0 for targets that do not support
+ /// that instruction.
+ unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
- /// \returns true if the machine has merged shaders in which s0-s7 are
- /// reserved by the hardware and user SGPRs start at s8
- bool hasMergedShaders() const {
- return getGeneration() >= GFX9;
- }
+ bool hasPrngInst() const { return HasPrngInst; }
- // \returns true if the target supports the pre-NGG legacy geometry path.
- bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
+ bool hasBVHDualAndBVH8Insts() const { return HasBVHDualAndBVH8Insts; }
- // \returns true if preloading kernel arguments is supported.
- bool hasKernargPreload() const { return KernargPreload; }
+ /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
+ /// SGPRs
+ unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
- // \returns true if the target has split barriers feature
- bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
+ /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
+ /// VGPRs
+ unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
+ unsigned DynamicVGPRBlockSize) const;
- // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
- bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; }
+ /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that
+ /// can be achieved when the only function running on a CU is \p F, each
+ /// workgroup uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs
+ /// SGPRs and \p NumVGPRs VGPRs. The flat workgroup sizes associated to the
+ /// function are a range, so this returns a range as well.
+ ///
+ /// Note that occupancy can be affected by the scratch allocation as well,
+ /// but we do not have enough information to compute it.
+ std::pair<unsigned, unsigned> computeOccupancy(
+ const Function &F, unsigned LDSSize = 0, unsigned NumSGPRs = 0,
+ unsigned NumVGPRs = 0) const;
- // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
- // no-return form.
- bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
+ /// \returns true if the flat_scratch register should be initialized with
+ /// the pointer to the wave's scratch memory rather than a size and offset.
+ bool flatScratchIsPointer() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
+ }
- // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
- bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
+ /// \returns true if the flat_scratch register is initialized by the HW.
+ /// In this case it is readonly.
+ bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
- // \returns true if the target has IEEE kernel descriptor mode bit
- bool hasIEEEMode() const { return getGeneration() < GFX12; }
+ /// \returns true if the architected SGPRs are enabled.
+ bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
- // \returns true if the target has IEEE fminimum/fmaximum instructions
- bool hasIEEEMinimumMaximumInsts() const { return HasIEEEMinimumMaximumInsts; }
+ /// \returns true if Global Data Share is supported.
+ bool hasGDS() const { return HasGDS; }
- // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
- bool hasRrWGMode() const { return getGeneration() >= GFX12; }
+ /// \returns true if Global Wave Sync is supported.
+ bool hasGWS() const { return HasGWS; }
- /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
- /// values.
- bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
+ /// \returns true if the machine has merged shaders in which s0-s7 are
+ /// reserved by the hardware and user SGPRs start at s8
+ bool hasMergedShaders() const { return getGeneration() >= GFX9; }
- bool hasGFX1250Insts() const { return GFX1250Insts; }
+ // \returns true if the target supports the pre-NGG legacy geometry path.
+ bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
- bool hasVOPD3() const { return GFX1250Insts; }
+ // \returns true if preloading kernel arguments is supported.
+ bool hasKernargPreload() const { return KernargPreload; }
- // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
- bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
+ // \returns true if the target has split barriers feature
+ bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
- // \returns true if the target has V_MAD_U32 instruction.
- bool hasMadU32Inst() const { return HasMadU32Inst; }
+ // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
+ bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; }
- // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
- bool hasVectorMulU64() const { return GFX1250Insts; }
+ // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
+ // no-return form.
+ bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
- // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
- // instructions.
- bool hasMadU64U32NoCarry() const { return GFX1250Insts; }
+ // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
+ bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
- // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
- bool hasIntMinMax64() const { return GFX1250Insts; }
+ // \returns true if the target has IEEE kernel descriptor mode bit
+ bool hasIEEEMode() const { return getGeneration() < GFX12; }
- // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions.
- bool hasAddMinMaxInsts() const { return GFX1250Insts; }
+ // \returns true if the target has IEEE fminimum/fmaximum instructions
+ bool hasIEEEMinimumMaximumInsts() const {
+ return HasIEEEMinimumMaximumInsts;
+ }
- // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
- bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
+ // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
+ bool hasRrWGMode() const { return getGeneration() >= GFX12; }
- // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
- bool hasPkMinMax3Insts() const { return GFX1250Insts; }
+ /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
+ /// values.
+ bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
- // \returns true if target has S_GET_SHADER_CYCLES_U64 instruction.
- bool hasSGetShaderCyclesInst() const { return GFX1250Insts; }
+ bool hasGFX1250Insts() const { return GFX1250Insts; }
- // \returns true if target has S_SETPRIO_INC_WG instruction.
- bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }
+ bool hasVOPD3() const { return GFX1250Insts; }
- // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
- // of sign-extending. Note that GFX1250 has not only fixed the bug but also
- // extended VA to 57 bits.
- bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; }
+ // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
+ bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
- // \returns true if the target needs to create a prolog for backward
- // compatibility when preloading kernel arguments.
- bool needsKernArgPreloadProlog() const {
- return hasKernargPreload() && !GFX1250Insts;
- }
+ // \returns true if the target has V_MAD_U32 instruction.
+ bool hasMadU32Inst() const { return HasMadU32Inst; }
- /// \returns SGPR allocation granularity supported by the subtarget.
- unsigned getSGPRAllocGranule() const {
- return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
- }
+ // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
+ bool hasVectorMulU64() const { return GFX1250Insts; }
- /// \returns SGPR encoding granularity supported by the subtarget.
- unsigned getSGPREncodingGranule() const {
- return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
- }
+ // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
+ // instructions.
+ bool hasMadU64U32NoCarry() const { return GFX1250Insts; }
- /// \returns Total number of SGPRs supported by the subtarget.
- unsigned getTotalNumSGPRs() const {
- return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
- }
+ // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
+ bool hasIntMinMax64() const { return GFX1250Insts; }
- /// \returns Addressable number of SGPRs supported by the subtarget.
- unsigned getAddressableNumSGPRs() const {
- return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
- }
+ // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions.
+ bool hasAddMinMaxInsts() const { return GFX1250Insts; }
- /// \returns Minimum number of SGPRs that meets the given number of waves per
- /// execution unit requirement supported by the subtarget.
- unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
- }
+ // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
+ bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
- /// \returns Maximum number of SGPRs that meets the given number of waves per
- /// execution unit requirement supported by the subtarget.
- unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
- return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
- }
+ // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
+ bool hasPkMinMax3Insts() const { return GFX1250Insts; }
- /// \returns Reserved number of SGPRs. This is common
- /// utility function called by MachineFunction and
- /// Function variants of getReservedNumSGPRs.
- unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
- /// \returns Reserved number of SGPRs for given machine function \p MF.
- unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
+ // \returns true if target has S_GET_SHADER_CYCLES_U64 instruction.
+ bool hasSGetShaderCyclesInst() const { return GFX1250Insts; }
- /// \returns Reserved number of SGPRs for given function \p F.
- unsigned getReservedNumSGPRs(const Function &F) const;
+ // \returns true if target has S_SETPRIO_INC_WG instruction.
+ bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }
- /// \returns Maximum number of preloaded SGPRs for the subtarget.
- unsigned getMaxNumPreloadedSGPRs() const;
+ // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
+ // of sign-extending. Note that GFX1250 has not only fixed the bug but also
+ // extended VA to 57 bits.
+ bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; }
- /// \returns max num SGPRs. This is the common utility
- /// function called by MachineFunction and Function
- /// variants of getMaxNumSGPRs.
- unsigned getBaseMaxNumSGPRs(const Function &F,
- std::pair<unsigned, unsigned> WavesPerEU,
- unsigned PreloadedSGPRs,
- unsigned ReservedNumSGPRs) const;
+ // \returns true if the target needs to create a prolog for backward
+ // compatibility when preloading kernel arguments.
+ bool needsKernArgPreloadProlog() const {
+ return hasKernargPreload() && !GFX1250Insts;
+ }
- /// \returns Maximum number of SGPRs that meets number of waves per execution
- /// unit requirement for function \p MF, or number of SGPRs explicitly
- /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
- ///
- /// \returns Value that meets number of waves per execution unit requirement
- /// if explicitly requested value cannot be converted to integer, violates
- /// subtarget's specifications, or does not meet number of waves per execution
- /// unit requirement.
- unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
+ /// \returns SGPR allocation granularity supported by the subtarget.
+ unsigned getSGPRAllocGranule() const {
+ return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
+ }
- /// \returns Maximum number of SGPRs that meets number of waves per execution
- /// unit requirement for function \p F, or number of SGPRs explicitly
- /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
- ///
- /// \returns Value that meets number of waves per execution unit requirement
- /// if explicitly requested value cannot be converted to integer, violates
- /// subtarget's specifications, or does not meet number of waves per execution
- /// unit requirement.
- unsigned getMaxNumSGPRs(const Function &F) const;
+ /// \returns SGPR encoding granularity supported by the subtarget.
+ unsigned getSGPREncodingGranule() const {
+ return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
+ }
- /// \returns VGPR allocation granularity supported by the subtarget.
- unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
- return AMDGPU::IsaInfo::getVGPRAllocGranule(this, DynamicVGPRBlockSize);
- }
+ /// \returns Total number of SGPRs supported by the subtarget.
+ unsigned getTotalNumSGPRs() const {
+ return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
+ }
- /// \returns VGPR encoding granularity supported by the subtarget.
- unsigned getVGPREncodingGranule() const {
- return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
- }
+ /// \returns Addressable number of SGPRs supported by the subtarget.
+ unsigned getAddressableNumSGPRs() const {
+ return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
+ }
- /// \returns Total number of VGPRs supported by the subtarget.
- unsigned getTotalNumVGPRs() const {
- return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
- }
+ /// \returns Minimum number of SGPRs that meets the given number of waves
+ /// per execution unit requirement supported by the subtarget.
+ unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
+ return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
+ }
- /// \returns Addressable number of architectural VGPRs supported by the
- /// subtarget.
- unsigned getAddressableNumArchVGPRs() const {
- return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
- }
+ /// \returns Maximum number of SGPRs that meets the given number of waves
+ /// per execution unit requirement supported by the subtarget.
+ unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
+ return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
+ }
- /// \returns Addressable number of VGPRs supported by the subtarget.
- unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
- return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize);
- }
+ /// \returns Reserved number of SGPRs. This is common
+ /// utility function called by MachineFunction and
+ /// Function variants of getReservedNumSGPRs.
+ unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
+ /// \returns Reserved number of SGPRs for given machine function \p MF.
+ unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
+
+ /// \returns Reserved number of SGPRs for given function \p F.
+ unsigned getReservedNumSGPRs(const Function &F) const;
+
+ /// \returns Maximum number of preloaded SGPRs for the subtarget.
+ unsigned getMaxNumPreloadedSGPRs() const;
+
+ /// \returns max num SGPRs. This is the common utility
+ /// function called by MachineFunction and Function
+ /// variants of getMaxNumSGPRs.
+ unsigned getBaseMaxNumSGPRs(
+ const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
+ unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const;
+
+ /// \returns Maximum number of SGPRs that meets number of waves per
+ /// execution unit requirement for function \p MF, or number of SGPRs
+ /// explicitly requested using "amdgpu-num-sgpr" attribute attached to
+ /// function \p MF.
+ ///
+ /// \returns Value that meets number of waves per execution unit requirement
+ /// if explicitly requested value cannot be converted to integer, violates
+ /// subtarget's specifications, or does not meet number of waves per
+ /// execution unit requirement.
+ unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
+
+ /// \returns Maximum number of SGPRs that meets number of waves per
+ /// execution unit requirement for function \p F, or number of SGPRs
+ /// explicitly requested using "amdgpu-num-sgpr" attribute attached to
+ /// function \p F.
+ ///
+ /// \returns Value that meets number of waves per execution unit requirement
+ /// if explicitly requested value cannot be converted to integer, violates
+ /// subtarget's specifications, or does not meet number of waves per
+ /// execution unit requirement.
+ unsigned getMaxNumSGPRs(const Function &F) const;
+
+ /// \returns VGPR allocation granularity supported by the subtarget.
+ unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
+ return AMDGPU::IsaInfo::getVGPRAllocGranule(this, DynamicVGPRBlockSize);
+ }
- /// \returns the minimum number of VGPRs that will prevent achieving more than
- /// the specified number of waves \p WavesPerEU.
- unsigned getMinNumVGPRs(unsigned WavesPerEU,
- unsigned DynamicVGPRBlockSize) const {
- return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU,
- DynamicVGPRBlockSize);
- }
+ /// \returns VGPR encoding granularity supported by the subtarget.
+ unsigned getVGPREncodingGranule() const {
+ return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
+ }
- /// \returns the maximum number of VGPRs that can be used and still achieved
- /// at least the specified number of waves \p WavesPerEU.
- unsigned getMaxNumVGPRs(unsigned WavesPerEU,
- unsigned DynamicVGPRBlockSize) const {
- return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU,
- DynamicVGPRBlockSize);
- }
+ /// \returns Total number of VGPRs supported by the subtarget.
+ unsigned getTotalNumVGPRs() const {
+ return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
+ }
- /// \returns max num VGPRs. This is the common utility function
- /// called by MachineFunction and Function variants of getMaxNumVGPRs.
- unsigned
- getBaseMaxNumVGPRs(const Function &F,
- std::pair<unsigned, unsigned> NumVGPRBounds) const;
+ /// \returns Addressable number of architectural VGPRs supported by the
+ /// subtarget.
+ unsigned getAddressableNumArchVGPRs() const {
+ return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
+ }
- /// \returns Maximum number of VGPRs that meets number of waves per execution
- /// unit requirement for function \p F, or number of VGPRs explicitly
- /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
- ///
- /// \returns Value that meets number of waves per execution unit requirement
- /// if explicitly requested value cannot be converted to integer, violates
- /// subtarget's specifications, or does not meet number of waves per execution
- /// unit requirement.
- unsigned getMaxNumVGPRs(const Function &F) const;
+ /// \returns Addressable number of VGPRs supported by the subtarget.
+ unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
+ return AMDGPU::IsaInfo::getAddressableNumVGPRs(this,
+ DynamicVGPRBlockSize);
+ }
- unsigned getMaxNumAGPRs(const Function &F) const {
- return getMaxNumVGPRs(F);
- }
+ /// \returns the minimum number of VGPRs that will prevent achieving more
+ /// than the specified number of waves \p WavesPerEU.
+ unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize)
+ const {
+ return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU,
+ DynamicVGPRBlockSize);
+ }
- /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
- /// of waves per execution unit required for the function \p MF.
- std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
+ /// \returns the maximum number of VGPRs that can be used and still achieved
+ /// at least the specified number of waves \p WavesPerEU.
+ unsigned getMaxNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize)
+ const {
+ return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU,
+ DynamicVGPRBlockSize);
+ }
- /// \returns Maximum number of VGPRs that meets number of waves per execution
- /// unit requirement for function \p MF, or number of VGPRs explicitly
- /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
- ///
- /// \returns Value that meets number of waves per execution unit requirement
- /// if explicitly requested value cannot be converted to integer, violates
- /// subtarget's specifications, or does not meet number of waves per execution
- /// unit requirement.
- unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
+ /// \returns max num VGPRs. This is the common utility function
+ /// called by MachineFunction and Function variants of getMaxNumVGPRs.
+ unsigned getBaseMaxNumVGPRs(
+ const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const;
+
+ /// \returns Maximum number of VGPRs that meets number of waves per
+ /// execution unit requirement for function \p F, or number of VGPRs
+ /// explicitly requested using "amdgpu-num-vgpr" attribute attached to
+ /// function \p F.
+ ///
+ /// \returns Value that meets number of waves per execution unit requirement
+ /// if explicitly requested value cannot be converted to integer, violates
+ /// subtarget's specifications, or does not meet number of waves per
+ /// execution unit requirement.
+ unsigned getMaxNumVGPRs(const Function &F) const;
+
+ unsigned getMaxNumAGPRs(const Function &F) const {
+ return getMaxNumVGPRs(F);
+ }
- bool supportsWave32() const { return getGeneration() >= GFX10; }
+ /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
+ /// of waves per execution unit required for the function \p MF.
+ std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
- bool supportsWave64() const { return !hasGFX1250Insts(); }
+ /// \returns Maximum number of VGPRs that meets number of waves per
+ /// execution unit requirement for function \p MF, or number of VGPRs
+ /// explicitly requested using "amdgpu-num-vgpr" attribute attached to
+ /// function \p MF.
+ ///
+ /// \returns Value that meets number of waves per execution unit requirement
+ /// if explicitly requested value cannot be converted to integer, violates
+ /// subtarget's specifications, or does not meet number of waves per
+ /// execution unit requirement.
+ unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
- bool isWave32() const {
- return getWavefrontSize() == 32;
- }
+ bool supportsWave32() const { return getGeneration() >= GFX10; }
- bool isWave64() const {
- return getWavefrontSize() == 64;
- }
+ bool supportsWave64() const { return !hasGFX1250Insts(); }
- /// Returns if the wavesize of this subtarget is known reliable. This is false
- /// only for the a default target-cpu that does not have an explicit
- /// +wavefrontsize target feature.
- bool isWaveSizeKnown() const {
- return hasFeature(AMDGPU::FeatureWavefrontSize32) ||
- hasFeature(AMDGPU::FeatureWavefrontSize64);
- }
+ bool isWave32() const { return getWavefrontSize() == 32; }
- const TargetRegisterClass *getBoolRC() const {
- return getRegisterInfo()->getBoolRC();
- }
+ bool isWave64() const { return getWavefrontSize() == 64; }
- /// \returns Maximum number of work groups per compute unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
- return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
- }
+ /// Returns if the wavesize of this subtarget is known reliable. This is
+ /// false only for the a default target-cpu that does not have an explicit
+ /// +wavefrontsize target feature.
+ bool isWaveSizeKnown() const {
+ return hasFeature(AMDGPU::FeatureWavefrontSize32) ||
+ hasFeature(AMDGPU::FeatureWavefrontSize64);
+ }
- /// \returns Minimum flat work group size supported by the subtarget.
- unsigned getMinFlatWorkGroupSize() const override {
- return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
- }
+ const TargetRegisterClass *getBoolRC() const {
+ return getRegisterInfo()->getBoolRC();
+ }
- /// \returns Maximum flat work group size supported by the subtarget.
- unsigned getMaxFlatWorkGroupSize() const override {
- return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
- }
+ /// \returns Maximum number of work groups per compute unit supported by the
+ /// subtarget and limited by given \p FlatWorkGroupSize.
+ unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
+ return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
+ }
- /// \returns Number of waves per execution unit required to support the given
- /// \p FlatWorkGroupSize.
- unsigned
- getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
- return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
- }
+ /// \returns Minimum flat work group size supported by the subtarget.
+ unsigned getMinFlatWorkGroupSize() const override {
+ return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
+ }
- /// \returns Minimum number of waves per execution unit supported by the
- /// subtarget.
- unsigned getMinWavesPerEU() const override {
- return AMDGPU::IsaInfo::getMinWavesPerEU(this);
- }
+ /// \returns Maximum flat work group size supported by the subtarget.
+ unsigned getMaxFlatWorkGroupSize() const override {
+ return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
+ }
- void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
- SDep &Dep,
- const TargetSchedModel *SchedModel) const override;
+ /// \returns Number of waves per execution unit required to support the
+ /// given
+ /// \p FlatWorkGroupSize.
+ unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize)
+ const override {
+ return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this,
+ FlatWorkGroupSize);
+ }
- // \returns true if it's beneficial on this subtarget for the scheduler to
- // cluster stores as well as loads.
- bool shouldClusterStores() const { return getGeneration() >= GFX11; }
+ /// \returns Minimum number of waves per execution unit supported by the
+ /// subtarget.
+ unsigned getMinWavesPerEU() const override {
+ return AMDGPU::IsaInfo::getMinWavesPerEU(this);
+ }
- // \returns the number of address arguments from which to enable MIMG NSA
- // on supported architectures.
- unsigned getNSAThreshold(const MachineFunction &MF) const;
+ void adjustSchedDependency(
+ SUnit * Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
+ const TargetSchedModel *SchedModel) const override;
- // \returns true if the subtarget has a hazard requiring an "s_nop 0"
- // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
- bool requiresNopBeforeDeallocVGPRs() const { return !GFX1250Insts; }
+ // \returns true if it's beneficial on this subtarget for the scheduler to
+ // cluster stores as well as loads.
+ bool shouldClusterStores() const { return getGeneration() >= GFX11; }
- // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
- // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
- bool requiresWaitIdleBeforeGetReg() const { return GFX1250Insts; }
+ // \returns the number of address arguments from which to enable MIMG NSA
+ // on supported architectures.
+ unsigned getNSAThreshold(const MachineFunction &MF) const;
- bool isDynamicVGPREnabled() const { return DynamicVGPR; }
- unsigned getDynamicVGPRBlockSize() const {
- return DynamicVGPRBlockSize32 ? 32 : 16;
- }
+ // \returns true if the subtarget has a hazard requiring an "s_nop 0"
+ // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
+ bool requiresNopBeforeDeallocVGPRs() const { return !GFX1250Insts; }
- bool requiresDisjointEarlyClobberAndUndef() const override {
- // AMDGPU doesn't care if early-clobber and undef operands are allocated
- // to the same register.
- return false;
- }
+ // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
+ // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
+ bool requiresWaitIdleBeforeGetReg() const { return GFX1250Insts; }
- // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
- // and surronded by S_WAIT_ALU(0xFFE3).
- bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const {
- return getGeneration() == GFX12;
- }
+ bool isDynamicVGPREnabled() const { return DynamicVGPR; }
+ unsigned getDynamicVGPRBlockSize() const {
+ return DynamicVGPRBlockSize32 ? 32 : 16;
+ }
- // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
- // read.
- bool hasScratchBaseForwardingHazard() const {
- return GFX1250Insts && getGeneration() == GFX12;
- }
+ bool requiresDisjointEarlyClobberAndUndef() const override {
+ // AMDGPU doesn't care if early-clobber and undef operands are allocated
+ // to the same register.
+ return false;
+ }
- /// \returns true if the subtarget supports clusters of workgroups.
- bool hasClusters() const { return GFX1250Insts; }
+ // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
+ // and surronded by S_WAIT_ALU(0xFFE3).
+ bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const {
+ return getGeneration() == GFX12;
+ }
- /// \returns true if the subtarget requires a wait for xcnt before atomic
- /// flat/global stores & rmw.
- bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
+ // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
+ // read.
+ bool hasScratchBaseForwardingHazard() const {
+ return GFX1250Insts && getGeneration() == GFX12;
+ }
- /// \returns the number of significant bits in the immediate field of the
- /// S_NOP instruction.
- unsigned getSNopBits() const {
- if (getGeneration() >= AMDGPUSubtarget::GFX12)
- return 7;
- if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- return 4;
- return 3;
- }
+ /// \returns true if the subtarget supports clusters of workgroups.
+ bool hasClusters() const { return GFX1250Insts; }
+
+ /// \returns true if the subtarget requires a wait for xcnt before atomic
+ /// flat/global stores & rmw.
+ bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
+
+ /// \returns the number of significant bits in the immediate field of the
+ /// S_NOP instruction.
+ unsigned getSNopBits() const {
+ if (getGeneration() >= AMDGPUSubtarget::GFX12)
+ return 7;
+ if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return 4;
+ return 3;
+ }
- /// \returns true if the sub-target supports buffer resource (V#) with 45-bit
- /// num_records.
- bool has45BitNumRecordsBufferResource() const {
- return Has45BitNumRecordsBufferResource;
- }
-};
+ /// \returns true if the sub-target supports buffer resource (V#) with
+ /// 45-bit num_records.
+ bool has45BitNumRecordsBufferResource() const {
+ return Has45BitNumRecordsBufferResource;
+ }
+ };
class GCNUserSGPRUsageInfo {
public:
``````````
</details>
https://github.com/llvm/llvm-project/pull/160922
More information about the llvm-commits
mailing list