[llvm-branch-commits] [llvm] AMDGPU: Increase the LDS size to support to 160 KB for gfx950 (PR #116309)

Mon Nov 18 08:40:15 PST 2024

https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/116309

>From 74ed0a510ff829e5e98d9edf0284ee4decfa4bc0 Mon Sep 17 00:00:00 2001
From: Pravin Jagtap <Pravin.Jagtap at amd.com>
Date: Wed, 13 Dec 2023 00:27:03 -0500
Subject: [PATCH 1/2] AMDGPU: Increase the LDS size to support to 160 KB for
 gfx950

---
 llvm/docs/AMDGPUUsage.rst                     |  2 +
 llvm/lib/Target/AMDGPU/AMDGPU.td              |  3 +-
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   | 12 +++--
 llvm/lib/Target/AMDGPU/AMDGPUFeatures.td      |  1 +
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  2 +
 llvm/test/CodeGen/AMDGPU/extra-lds-size.ll    |  7 +++
 .../AMDGPU/lds-limit-diagnostics-gfx950.ll    | 13 +++++
 .../CodeGen/AMDGPU/lds-size-hsa-gfx950.ll     | 31 +++++++++++
 .../CodeGen/AMDGPU/lds-size-pal-gfx950.ll     | 26 ++++++++++
 .../tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s | 52 +++++++++++++++++++
 10 files changed, 144 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx950.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll
 create mode 100644 llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index b85b680b9c82d3..a25b6feddbeddc 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5475,6 +5475,8 @@ The fields used by CP for code objects before V3 also match those specified in
                                                        roundup(lds-size / (64 * 4))
                                                      GFX7-GFX11
                                                        roundup(lds-size / (128 * 4))
+                                                     GFX950
+                                                       roundup(lds-size / (320 * 4))
 
      24      1 bit   ENABLE_EXCEPTION_IEEE_754_FP    Wavefront starts execution
                      _INVALID_OPERATION              with specified exceptions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 35dbf86b7c6f36..a05d4a644d08d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1494,7 +1494,8 @@ def FeatureISAVersion9_5_Common : FeatureSet<
   [FeatureFP8Insts,
    FeatureFP8ConversionInsts,
    FeatureCvtFP8VOP1Bug,
-   FeatureGFX950Insts
+   FeatureGFX950Insts,
+   FeatureAddressableLocalMemorySize163840
    ])>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d801f2b1591275..90ece275412c7c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1172,12 +1172,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.DX10Clamp = Mode.DX10Clamp;
 
   unsigned LDSAlignShift;
-  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
-    // LDS is allocated in 64 dword blocks.
-    LDSAlignShift = 8;
-  } else {
+  if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
+    // LDS is allocated in 320 dword blocks.
+    LDSAlignShift = 11;
+  } else if (STM.getFeatureBits().test(
+                 FeatureAddressableLocalMemorySize65536)) {
     // LDS is allocated in 128 dword blocks.
     LDSAlignShift = 9;
+  } else {
+    // LDS is allocated in 64 dword blocks.
+    LDSAlignShift = 8;
   }
 
   ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index f832a2a55d6229..74d1faeb6f545b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -29,6 +29,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature<
 
 def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>;
 def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>;
+def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>;
 
 class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
   "wavefrontsize"#!shl(1, ValueLog2),
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 01866fbd9da6e7..501d00b1f308d9 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -916,6 +916,8 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
     return 32768;
   if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize65536))
     return 65536;
+  if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
+    return 163840;
   return 0;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
index 13640b74a7937b..318ecd16a2ccb3 100644
--- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
@@ -2,6 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10-MESA %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-PAL %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-MESA %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-PAL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-MESA %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-PAL %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-MESA %s
 
@@ -17,6 +19,11 @@
 ; GFX11-MESA: .long 45100
 ; GFX11-MESA-NEXT: .long 1024
 
+; GFX950-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200
+
+; GFX950-MESA: .long 45100
+; GFX950-MESA-NEXT: .long 512
+
 ; GFX1200-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x400
 
 ; GFX1200-MESA: .long 45100
diff --git a/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx950.ll b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx950.ll
new file mode 100644
index 00000000000000..19166b271db775
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx950.ll
@@ -0,0 +1,13 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; GFX950 supports upto 160 KB LDS memory.
+; This is a negative test to check when the LDS size exceeds the max usable limit.
+
+; ERROR: error: <unknown>:0:0: local memory (163844) exceeds limit (163840) in function 'test_lds_limit'
+ at dst = addrspace(3) global [40961 x i32] poison
+
+define amdgpu_kernel void @test_lds_limit(i32 %val) {
+  %gep = getelementptr [40961 x i32], ptr addrspace(3) @dst, i32 0, i32 100
+  store i32 %val, ptr addrspace(3) %gep
+  ret void
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll
new file mode 100644
index 00000000000000..6ebfc9a5e9d4f6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=MESA %s
+
+; gfx950 supports upto 160 KB configurable LDS memory.
+; This test checks the max and above the old i.e. 128 KiB size of LDS that can be allocated.
+
+ at lds.i32 = addrspace(3) global i32 poison
+ at lds.array.size.131076 = addrspace(3) global [32768 x i32] poison
+ at lds.array.size.163840 = addrspace(3) global [40959 x i32] poison
+
+; GCN-LABEL: test_lds_array_size_131076:
+; GCN: .amdhsa_group_segment_fixed_size 131076
+; GCN: ; LDSByteSize: 131076 bytes/workgroup
+; MESA: granulated_lds_size = 65
+define amdgpu_kernel void @test_lds_array_size_131076() {
+  %gep = getelementptr inbounds [32768 x i32], ptr addrspace(3) @lds.array.size.131076, i32 0, i32 20
+  %val = load i32, ptr addrspace(3) %gep
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}
+
+; GCN-LABEL: test_lds_array_size_163840:
+; GCN: .amdhsa_group_segment_fixed_size 163840
+; GCN: ; LDSByteSize: 163840 bytes/workgroup
+; MESA: granulated_lds_size = 80
+define amdgpu_kernel void @test_lds_array_size_163840() {
+  %gep = getelementptr inbounds [40959 x i32], ptr addrspace(3) @lds.array.size.163840 , i32 0, i32 20
+  %val = load i32, ptr addrspace(3) %gep
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll
new file mode 100644
index 00000000000000..22cad8ab5f5360
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=PAL %s
+
+; GFX950supports upto 160 KB configurable LDS memory.
+; This test checks the min and max size of LDS that can be allocated.
+
+; PAL: .shader_functions:
+; PAL: test_lds_array_i32:
+; PAL: .lds_size:       0x28000
+; PAL: test_lds_i32:
+; PAL: .lds_size:       0x4
+
+
+ at lds.i32 = addrspace(3) global i32 poison
+ at lds.array.i32 = addrspace(3) global [40959 x i32] poison
+
+define amdgpu_gfx void @test_lds_i32(i32 %val) {
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i32() {
+  %gep = getelementptr inbounds [40959 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
+  %val = load i32, ptr addrspace(3) %gep
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}
\ No newline at end of file
diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s
new file mode 100644
index 00000000000000..5b9d42c7fad553
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s
@@ -0,0 +1,52 @@
+;; Test disassembly for gfx950 kernel descriptor.
+
+; RUN: rm -rf %t && split-file %s %t && cd %t
+
+;--- 1.s
+; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-xnack -filetype=obj -mcpu=gfx950 < 1.s > 1.o
+; RUN: llvm-objdump --disassemble-symbols=kernel.kd 1.o | tail -n +7 | tee 1-disasm.s | FileCheck 1.s
+; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-xnack -filetype=obj -mcpu=gfx950 < 1-disasm.s > 1-disasm.o
+; FIxMe: cmp 1.o 1-disasm.o
+; CHECK: .amdhsa_kernel kernel
+; CHECK-NEXT:	.amdhsa_group_segment_fixed_size 163840
+; CHECK-NEXT:	.amdhsa_private_segment_fixed_size 0
+; CHECK-NEXT:	.amdhsa_kernarg_size 0
+; CHECK-NEXT:	.amdhsa_accum_offset 4
+; CHECK-NEXT:	.amdhsa_tg_split 0
+; CHECK-NEXT:	.amdhsa_next_free_vgpr 8
+; CHECK-NEXT:	.amdhsa_reserve_vcc 0
+; CHECK-NEXT:	.amdhsa_reserve_xnack_mask 0
+; CHECK-NEXT:	.amdhsa_next_free_sgpr 8
+; CHECK-NEXT:	.amdhsa_float_round_mode_32 0
+; CHECK-NEXT:	.amdhsa_float_round_mode_16_64 0
+; CHECK-NEXT:	.amdhsa_float_denorm_mode_32 0
+; CHECK-NEXT:	.amdhsa_float_denorm_mode_16_64 3
+; CHECK-NEXT:	.amdhsa_dx10_clamp 1
+; CHECK-NEXT:	.amdhsa_ieee_mode 1
+; CHECK-NEXT:	.amdhsa_fp16_overflow 0
+; CHECK-NEXT:	.amdhsa_enable_private_segment 0
+; CHECK-NEXT:	.amdhsa_system_sgpr_workgroup_id_x 1
+; CHECK-NEXT:	.amdhsa_system_sgpr_workgroup_id_y 0
+; CHECK-NEXT:	.amdhsa_system_sgpr_workgroup_id_z 0
+; CHECK-NEXT:	.amdhsa_system_sgpr_workgroup_info 0
+; CHECK-NEXT:	.amdhsa_system_vgpr_workitem_id 0
+; CHECK-NEXT:	.amdhsa_exception_fp_ieee_invalid_op 0
+; CHECK-NEXT:	.amdhsa_exception_fp_denorm_src 0
+; CHECK-NEXT:	.amdhsa_exception_fp_ieee_div_zero 0
+; CHECK-NEXT:	.amdhsa_exception_fp_ieee_overflow 0
+; CHECK-NEXT:	.amdhsa_exception_fp_ieee_underflow 0
+; CHECK-NEXT:	.amdhsa_exception_fp_ieee_inexact 0
+; CHECK-NEXT:	.amdhsa_exception_int_div_zero 0
+; CHECK-NEXT:	.amdhsa_user_sgpr_dispatch_ptr 0
+; CHECK-NEXT:	.amdhsa_user_sgpr_queue_ptr 0
+; CHECK-NEXT:	.amdhsa_user_sgpr_kernarg_segment_ptr 0
+; CHECK-NEXT:	.amdhsa_user_sgpr_dispatch_id 0
+; CHECK-NEXT:	.amdhsa_user_sgpr_private_segment_size 0
+; CHECK-NEXT:	.amdhsa_uses_dynamic_stack 0
+; CHECK-NEXT:.end_amdhsa_kernel
+.amdhsa_kernel kernel
+  .amdhsa_group_segment_fixed_size 163840
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+.end_amdhsa_kernel

>From 6ab31c00bb87ef8488c29e452c47cfd7af8aaac9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 15 Nov 2024 10:51:15 -0800
Subject: [PATCH 2/2] Avoid adding multiple addressablelocalmemorysizes to
 gfx950

Also expand the limit testing.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td              | 16 +++++++---
 .../AMDGPU/lds-limit-diagnostics-gfx950.ll    | 13 --------
 .../CodeGen/AMDGPU/lds-limit-diagnostics.ll   | 32 +++++++++++++++++++
 3 files changed, 43 insertions(+), 18 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx950.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index a05d4a644d08d1..e84fdf54866cdd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1192,7 +1192,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
 
 def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
   "gfx9",
-  [FeatureFP64, FeatureAddressableLocalMemorySize65536,
+  [FeatureFP64,
    FeatureWavefrontSize64, FeatureFlatAddressSpace,
    FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
    FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
@@ -1358,6 +1358,7 @@ def FeatureISAVersion8_1_0 : FeatureSet<
 
 def FeatureISAVersion9_0_Common : FeatureSet<
   [FeatureGFX9,
+   FeatureAddressableLocalMemorySize65536,
    FeatureLDSBankCount32,
    FeatureImageInsts,
    FeatureMadMacF32Insts]>;
@@ -1375,7 +1376,8 @@ def FeatureISAVersion9_Generic : FeatureSet<
 
 def FeatureISAVersion9_0_MI_Common : FeatureSet<
   !listconcat(FeatureISAVersion9_0_Common.Features,
-    [FeatureFmaMixInsts,
+    [FeatureAddressableLocalMemorySize65536,
+     FeatureFmaMixInsts,
      FeatureDLInsts,
      FeatureDot1Insts,
      FeatureDot2Insts,
@@ -1491,16 +1493,17 @@ def FeatureISAVersion9_4_Common : FeatureSet<
 
 def FeatureISAVersion9_5_Common : FeatureSet<
   !listconcat(FeatureISAVersion9_4_Common.Features,
-  [FeatureFP8Insts,
+  [FeatureAddressableLocalMemorySize163840,
+   FeatureFP8Insts,
    FeatureFP8ConversionInsts,
    FeatureCvtFP8VOP1Bug,
    FeatureGFX950Insts,
-   FeatureAddressableLocalMemorySize163840
    ])>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
   !listconcat(FeatureISAVersion9_4_Common.Features,
     [
+      FeatureAddressableLocalMemorySize65536,
       FeatureForceStoreSC0SC1,
       FeatureFP8Insts,
       FeatureFP8ConversionInsts,
@@ -1511,6 +1514,7 @@ def FeatureISAVersion9_4_0 : FeatureSet<
 def FeatureISAVersion9_4_1 : FeatureSet<
   !listconcat(FeatureISAVersion9_4_Common.Features,
     [
+      FeatureAddressableLocalMemorySize65536,
       FeatureForceStoreSC0SC1,
       FeatureFP8Insts,
       FeatureFP8ConversionInsts,
@@ -1521,6 +1525,7 @@ def FeatureISAVersion9_4_1 : FeatureSet<
 def FeatureISAVersion9_4_2 : FeatureSet<
   !listconcat(FeatureISAVersion9_4_Common.Features,
     [
+      FeatureAddressableLocalMemorySize65536,
       FeatureFP8Insts,
       FeatureFP8ConversionInsts,
       FeatureCvtFP8VOP1Bug,
@@ -1529,7 +1534,8 @@ def FeatureISAVersion9_4_2 : FeatureSet<
 
 def FeatureISAVersion9_4_Generic : FeatureSet<
   !listconcat(FeatureISAVersion9_4_Common.Features,
-    [FeatureRequiresCOV6])>;
+    [FeatureAddressableLocalMemorySize65536,
+     FeatureRequiresCOV6])>;
 
 def FeatureISAVersion9_5_0 : FeatureSet<FeatureISAVersion9_5_Common.Features>;
 
diff --git a/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx950.ll b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx950.ll
deleted file mode 100644
index 19166b271db775..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx950.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-
-; GFX950 supports upto 160 KB LDS memory.
-; This is a negative test to check when the LDS size exceeds the max usable limit.
-
-; ERROR: error: <unknown>:0:0: local memory (163844) exceeds limit (163840) in function 'test_lds_limit'
- at dst = addrspace(3) global [40961 x i32] poison
-
-define amdgpu_kernel void @test_lds_limit(i32 %val) {
-  %gep = getelementptr [40961 x i32], ptr addrspace(3) @dst, i32 0, i32 100
-  store i32 %val, ptr addrspace(3) %gep
-  ret void
-}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics.ll b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics.ll
new file mode 100644
index 00000000000000..73f6dcb3a2a1d2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics.ll
@@ -0,0 +1,32 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT160K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx941 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT32K %s
+
+; gfx950 supports upto 160 KB LDS memory. The generic target does not.
+; This is a negative test to check when the LDS size exceeds the max usable limit.
+
+; ERROR-LIMIT160K: error: <unknown>:0:0: local memory (163844) exceeds limit (163840) in function 'test_lds_limit'
+; ERROR-LIMIT64K: error: <unknown>:0:0: local memory (163844) exceeds limit (65536) in function 'test_lds_limit'
+; ERROR-LIMIT32K: error: <unknown>:0:0: local memory (163844) exceeds limit (32768) in function 'test_lds_limit'
+ at dst = addrspace(3) global [40961 x i32] poison
+
+define amdgpu_kernel void @test_lds_limit(i32 %val) {
+  %gep = getelementptr [40961 x i32], ptr addrspace(3) @dst, i32 0, i32 100
+  store i32 %val, ptr addrspace(3) %gep
+  ret void
+}