[llvm] 49f2093 - [AMDGPU] Increase LDS to 320K on gfx1250 (#153645)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 14 12:52:03 PDT 2025
Author: Stanislav Mekhanoshin
Date: 2025-08-14T12:52:00-07:00
New Revision: 49f20934776cccbed82ccdca657d9111bf550286
URL: https://github.com/llvm/llvm-project/commit/49f20934776cccbed82ccdca657d9111bf550286
DIFF: https://github.com/llvm/llvm-project/commit/49f20934776cccbed82ccdca657d9111bf550286.diff
LOG: [AMDGPU] Increase LDS to 320K on gfx1250 (#153645)
Added:
llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll
llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll
llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll
Modified:
llvm/docs/AMDGPUUsage.rst
llvm/lib/Target/AMDGPU/AMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
Removed:
################################################################################
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 5343d66b083c7..8d0786ab0440d 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5598,6 +5598,8 @@ The fields used by CP for code objects before V3 also match those specified in
roundup(lds-size / (128 * 4))
GFX950
roundup(lds-size / (320 * 4))
+ GFX125*
+ roundup(lds-size / (256 * 4))
24 1 bit ENABLE_EXCEPTION_IEEE_754_FP Wavefront starts execution
_INVALID_OPERATION with specified exceptions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index f26639847be75..8e4b6365dc06b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1548,7 +1548,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
"gfx12",
- [FeatureFP64, FeatureAddressableLocalMemorySize65536, FeatureMIMG_R128,
+ [FeatureFP64, FeatureMIMG_R128,
FeatureFlatAddressSpace, Feature16BitInsts,
FeatureInv2PiInlineImm, FeatureApertureRegs,
FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts,
@@ -1977,6 +1977,7 @@ def FeatureISAVersion11_5_3 : FeatureSet<
def FeatureISAVersion12 : FeatureSet<
[FeatureGFX12,
+ FeatureAddressableLocalMemorySize65536,
FeatureLDSBankCount32,
FeatureDLInsts,
FeatureDot7Insts,
@@ -2019,6 +2020,7 @@ def FeatureISAVersion12_50 : FeatureSet<
[FeatureGFX12,
FeatureGFX1250Insts,
FeatureCUStores,
+ FeatureAddressableLocalMemorySize327680,
FeatureCuMode,
Feature64BitLiterals,
FeatureLDSBankCount32,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 626734a4752f3..c7d2d268a2707 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1103,7 +1103,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.DX10Clamp = Mode.DX10Clamp;
unsigned LDSAlignShift;
- if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
+ if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) {
+ // LDS is allocated in 256 dword blocks.
+ LDSAlignShift = 10;
+ } else if (STM.getFeatureBits().test(
+ FeatureAddressableLocalMemorySize163840)) {
// LDS is allocated in 320 dword blocks.
LDSAlignShift = 11;
} else if (STM.getFeatureBits().test(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index 74d1faeb6f545..d14b5ce80d28e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -30,6 +30,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature<
def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>;
def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>;
def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>;
+def FeatureAddressableLocalMemorySize327680 : SubtargetFeatureAddressableLocalMemorySize<327680>;
class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
"wavefrontsize"#!shl(1, ValueLog2),
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index e0ac040bdd226..ec9f1abdd8467 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1160,6 +1160,8 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
return 65536;
if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
return 163840;
+ if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize327680))
+ return 327680;
return 0;
}
@@ -3340,8 +3342,8 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) {
}
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
- // Currently this is 128 for all subtargets
- return 128;
+ return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256
+ : 128;
}
bool isPackedFP32Inst(unsigned Opc) {
diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
index e1ce5341efdd1..4349b18fd394c 100644
--- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-PAL %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-MESA %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-PAL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-MESA %s
; Check EXTRA_LDS_SIZE in SPI_SHADER_PGM_RSRC2_PS.
@@ -29,6 +31,11 @@
; GFX1200-MESA: .long 45100
; GFX1200-MESA-NEXT: .long 1024
+; GFX1250-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200
+
+; GFX1250-MESA: .long 45100
+; GFX1250-MESA-NEXT: .long 512
+
@lds = internal addrspace(3) global [4096 x i8] poison
define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll
new file mode 100644
index 0000000000000..da92dcdd7104e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll
@@ -0,0 +1,13 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; GFX1250 supports up to 320 KB LDS memory.
+; This is a negative test to check when the LDS size exceeds the max usable limit.
+
+; ERROR: error: <unknown>:0:0: local memory (327684) exceeds limit (327680) in function 'test_lds_limit'
+@dst = addrspace(3) global [81921 x i32] undef
+
+define amdgpu_kernel void @test_lds_limit(i32 %val) {
+ %gep = getelementptr [81921 x i32], ptr addrspace(3) @dst, i32 0, i32 100
+ store i32 %val, ptr addrspace(3) %gep
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll
new file mode 100644
index 0000000000000..3db0fa8f21759
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=MESA %s
+
+; GFX1250 supports up to 320 KB configurable LDS memory.
+; This test checks the min and max size of LDS that can be allocated.
+
+@lds.i8 = addrspace(3) global i8 undef
+@lds.array.i8 = addrspace(3) global [327679 x i8] undef
+@lds.i16 = addrspace(3) global i16 undef
+@lds.array.i16 = addrspace(3) global [163839 x i16] undef
+@lds.i32 = addrspace(3) global i32 undef
+@lds.array.i32 = addrspace(3) global [81919 x i32] undef
+
+; GCN-LABEL: test_lds_i8:
+; GCN: .amdhsa_group_segment_fixed_size 1
+; GCN: ; LDSByteSize: 1 bytes/workgroup
+; MESA: granulated_lds_size = 1
+define amdgpu_kernel void @test_lds_i8(i8 %val) {
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+; GCN-LABEL: test_lds_i16:
+; GCN: .amdhsa_group_segment_fixed_size 2
+; GCN: ; LDSByteSize: 2 bytes/workgroup
+; MESA: granulated_lds_size = 1
+define amdgpu_kernel void @test_lds_i16(i16 %val) {
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+; GCN-LABEL: test_lds_i32:
+; GCN: .amdhsa_group_segment_fixed_size 4
+; GCN: ; LDSByteSize: 4 bytes/workgroup
+; MESA: granulated_lds_size = 1
+define amdgpu_kernel void @test_lds_i32(i32 %val) {
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
+
+; GCN-LABEL: test_lds_array_i8:
+; GCN: .amdhsa_group_segment_fixed_size 327680
+; GCN: ; LDSByteSize: 327680 bytes/workgroup
+; MESA: granulated_lds_size = 320
+define amdgpu_kernel void @test_lds_array_i8() {
+ %gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5
+ %val = load i8, ptr addrspace(3) %gep
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+; GCN-LABEL: test_lds_array_i16:
+; GCN: .amdhsa_group_segment_fixed_size 327680
+; GCN: ; LDSByteSize: 327680 bytes/workgroup
+; MESA: granulated_lds_size = 320
+define amdgpu_kernel void @test_lds_array_i16() {
+ %gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10
+ %val = load i16, ptr addrspace(3) %gep
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+; GCN-LABEL: test_lds_array_i32:
+; GCN: .amdhsa_group_segment_fixed_size 327680
+; GCN: ; LDSByteSize: 327680 bytes/workgroup
+; MESA: granulated_lds_size = 320
+define amdgpu_kernel void @test_lds_array_i32() {
+ %gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
+ %val = load i32, ptr addrspace(3) %gep
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll
new file mode 100644
index 0000000000000..bfa7d37ce63a7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll
@@ -0,0 +1,61 @@
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=PAL %s
+
+; GFX1250 supports up to 320 KB configurable LDS memory.
+; This test checks the min and max size of LDS that can be allocated.
+
+; PAL: .shader_functions:
+; PAL: test_lds_array_i16:
+; PAL: .lds_size: 0x50000
+; PAL: test_lds_array_i32:
+; PAL: .lds_size: 0x50000
+; PAL: test_lds_array_i8:
+; PAL: .lds_size: 0x50000
+; PAL: test_lds_i16:
+; PAL: .lds_size: 0x2
+; PAL: test_lds_i32:
+; PAL: .lds_size: 0x4
+; PAL: test_lds_i8:
+; PAL: .lds_size: 0x1
+
+@lds.i8 = addrspace(3) global i8 undef
+@lds.array.i8 = addrspace(3) global [327679 x i8] undef
+@lds.i16 = addrspace(3) global i16 undef
+@lds.array.i16 = addrspace(3) global [163839 x i16] undef
+@lds.i32 = addrspace(3) global i32 undef
+@lds.array.i32 = addrspace(3) global [81919 x i32] undef
+
+define amdgpu_gfx void @test_lds_i8(i8 %val) {
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_i16(i16 %val) {
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_i32(i32 %val) {
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i8() {
+ %gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5
+ %val = load i8, ptr addrspace(3) %gep
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i16() {
+ %gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10
+ %val = load i16, ptr addrspace(3) %gep
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i32() {
+ %gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
+ %val = load i32, ptr addrspace(3) %gep
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
More information about the llvm-commits
mailing list