[llvm] 9803de0 - [AMDGPU] Add dynamic LDS size implicit kernel argument to CO-v5 (#65273)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 4 05:35:16 PST 2024


Author: Chaitanya
Date: 2024-01-04T19:05:12+05:30
New Revision: 9803de0e8e3abbbc94a4265d5847db435897a384

URL: https://github.com/llvm/llvm-project/commit/9803de0e8e3abbbc94a4265d5847db435897a384
DIFF: https://github.com/llvm/llvm-project/commit/9803de0e8e3abbbc94a4265d5847db435897a384.diff

LOG: [AMDGPU] Add dynamic LDS size implicit kernel argument to CO-v5 (#65273)

"hidden_dynamic_lds_size" argument will be added in the reserved section
at offset 120 of the implicit argument layout.
Add a "UsesDynamicLDS" flag to AMDGPUMachineFunction, queried through
isDynamicLDSUsed(), to identify whether a function uses dynamic LDS.

The hidden argument is added in the following cases:

- LDS global is used in the kernel.
- Kernel calls a function which uses LDS global.
- LDS pointer is passed as argument to kernel itself.

Added: 
    llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-kernarg-hidden-args-v5.ll

Modified: 
    llvm/docs/AMDGPUUsage.rst
    llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
    llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
    llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
    llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll

Removed: 
    


################################################################################
diff  --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 371e583c22a835..e05f7fc3e76627 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -4114,6 +4114,9 @@ Code object V5 metadata is the same as
                                                        buffer that conforms to the requirements of the malloc/free
                                                        device library V1 version implementation.
 
+                                                     "hidden_dynamic_lds_size"
+                                                       Size of the dynamically allocated LDS memory is passed in the kernarg.
+
                                                      "hidden_private_base"
                                                        The high 32 bits of the flat addressing private aperture base.
                                                        Only used by GFX8 to allow conversion between private segment

diff  --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
index dda3380c04ea9b..33eed07c46292f 100644
--- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
+++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
@@ -134,6 +134,7 @@ bool MetadataVerifier::verifyKernelArgs(msgpack::DocNode &Node) {
                                .Case("hidden_default_queue", true)
                                .Case("hidden_completion_action", true)
                                .Case("hidden_multigrid_sync_arg", true)
+                               .Case("hidden_dynamic_lds_size", true)
                                .Case("hidden_private_base", true)
                                .Case("hidden_shared_base", true)
                                .Case("hidden_queue_ptr", true)

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index b51a876750b58b..74e9cd7d09654c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -646,7 +646,15 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
     Offset += 8; // Skipped.
   }
 
-  Offset += 72; // Reserved.
+  // Emit argument for hidden dynamic lds size
+  if (MFI.isDynamicLDSUsed()) {
+    emitKernelArg(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset,
+                  Args);
+  } else {
+    Offset += 4; // skipped
+  }
+
+  Offset += 68; // Reserved.
 
   // hidden_private_base and hidden_shared_base are only when the subtarget has
   // ApertureRegs.

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 323462e60a29fa..31777295b4f8fe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -19,6 +19,26 @@
 
 using namespace llvm;
 
+static const GlobalVariable *
+getKernelDynLDSGlobalFromFunction(const Function &F) {
+  const Module *M = F.getParent();
+  SmallString<64> KernelDynLDSName("llvm.amdgcn.");
+  KernelDynLDSName += F.getName();
+  KernelDynLDSName += ".dynlds";
+  return M->getNamedGlobal(KernelDynLDSName);
+}
+
+static bool hasLDSKernelArgument(const Function &F) {
+  for (const Argument &Arg : F.args()) {
+    Type *ArgTy = Arg.getType();
+    if (auto PtrTy = dyn_cast<PointerType>(ArgTy)) {
+      if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
+        return true;
+    }
+  }
+  return false;
+}
+
 AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
                                              const AMDGPUSubtarget &ST)
     : IsEntryFunction(AMDGPU::isEntryFunctionCC(F.getCallingConv())),
@@ -65,6 +85,10 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
   Attribute NSZAttr = F.getFnAttribute("no-signed-zeros-fp-math");
   NoSignedZerosFPMath =
       NSZAttr.isStringAttribute() && NSZAttr.getValueAsString() == "true";
+
+  const GlobalVariable *DynLdsGlobal = getKernelDynLDSGlobalFromFunction(F);
+  if (DynLdsGlobal || hasLDSKernelArgument(F))
+    UsesDynamicLDS = true;
 }
 
 unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
@@ -139,15 +163,6 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
   return Offset;
 }
 
-static const GlobalVariable *
-getKernelDynLDSGlobalFromFunction(const Function &F) {
-  const Module *M = F.getParent();
-  std::string KernelDynLDSName = "llvm.amdgcn.";
-  KernelDynLDSName += F.getName();
-  KernelDynLDSName += ".dynlds";
-  return M->getNamedGlobal(KernelDynLDSName);
-}
-
 std::optional<uint32_t>
 AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) {
   // TODO: Would be more consistent with the abs symbols to use a range
@@ -210,3 +225,9 @@ void AMDGPUMachineFunction::setDynLDSAlign(const Function &F,
     }
   }
 }
+
+void AMDGPUMachineFunction::setUsesDynamicLDS(bool DynLDS) {
+  UsesDynamicLDS = DynLDS;
+}
+
+bool AMDGPUMachineFunction::isDynamicLDSUsed() const { return UsesDynamicLDS; }

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 248ee26a47eb1d..7efb7f825348e3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -46,6 +46,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   /// stages.
   Align DynLDSAlign;
 
+  // Flag to check dynamic LDS usage by kernel.
+  bool UsesDynamicLDS = false;
+
   // Kernels + shaders. i.e. functions called by the hardware and not called
   // by other functions.
   bool IsEntryFunction = false;
@@ -119,6 +122,10 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   Align getDynLDSAlign() const { return DynLDSAlign; }
 
   void setDynLDSAlign(const Function &F, const GlobalVariable &GV);
+
+  void setUsesDynamicLDS(bool DynLDS);
+
+  bool isDynamicLDSUsed() const;
 };
 
 }

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0e857e6ac71b61..b481ae43e8215c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6890,6 +6890,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
         // Adjust alignment for that dynamic shared memory array.
         Function &F = DAG.getMachineFunction().getFunction();
         MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
+        MFI->setUsesDynamicLDS(true);
         return SDValue(
             DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
       }

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll
new file mode 100644
index 00000000000000..cb15ff9fcb1bce
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll
@@ -0,0 +1,124 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s
+
+
+; CHECK:	amdhsa.kernels:
+; CHECK-NEXT:       - .args:
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .name:           r
+; CHECK-NEXT:         .offset:         0
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     global_buffer
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .name:           a
+; CHECK-NEXT:         .offset:         8
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     global_buffer
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .name:           b
+; CHECK-NEXT:         .offset:         16
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     global_buffer
+; CHECK-NEXT:       - .offset:         24
+; CHECK-NEXT:         .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_block_count_x
+; CHECK-NEXT:      - .offset:         28
+; CHECK-NEXT:        .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_block_count_y
+; CHECK-NEXT:      - .offset:         32
+; CHECK-NEXT:        .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_block_count_z
+; CHECK-NEXT:      - .offset:         36
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_group_size_x
+; CHECK-NEXT:      - .offset:         38
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_group_size_y
+; CHECK-NEXT:      - .offset:         40
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_group_size_z
+; CHECK-NEXT:      - .offset:         42
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_remainder_x
+; CHECK-NEXT:      - .offset:         44
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_remainder_y
+; CHECK-NEXT:      - .offset:         46
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_remainder_z
+; CHECK-NEXT:      - .offset:         64
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:      - .offset:         72
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:      - .offset:         80
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:      - .offset:         88
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_grid_dims
+; CHECK-NEXT:      - .offset:         96
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:      - .offset:         104
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_hostcall_buffer
+; CHECK-NEXT:      - .offset:         112
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_multigrid_sync_arg
+; CHECK-NEXT:      - .offset:         120
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_heap_v1
+; CHECK-NEXT:      - .offset:         128
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_default_queue
+; CHECK-NEXT:      - .offset:         136
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_completion_action
+; CHECK:          - .offset:          144
+; CHECK-NEXT:        .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_dynamic_lds_size
+; CHECK:          - .offset:          224
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_queue_ptr
+
+; CHECK:          .name:           test_v5
+; CHECK:          .symbol:         test_v5.kd
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 2
+@lds = external hidden addrspace(3) global [0 x i32], align 4
+
+define void @funcs_dyn_lds() {
+  store i32 1234, ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_v5(
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #0 {
+entry:
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, ptr addrspace(1) %r
+  call void @funcs_dyn_lds()
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
+!llvm.printf.fmts = !{!1, !2}
+!1 = !{!"1:1:4:%d\5Cn"}
+!2 = !{!"2:1:8:%g\5Cn"}
+
+attributes #0 = { optnone noinline }
+

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll
new file mode 100644
index 00000000000000..16bfe5f0196835
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll
@@ -0,0 +1,124 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s
+
+
+; CHECK:	amdhsa.kernels:
+; CHECK-NEXT:       - .args:
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .name:           r
+; CHECK-NEXT:         .offset:         0
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     global_buffer
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .name:           a
+; CHECK-NEXT:         .offset:         8
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     global_buffer
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .name:           b
+; CHECK-NEXT:         .offset:         16
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     global_buffer
+; CHECK-NEXT:       - .offset:         24
+; CHECK-NEXT:         .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_block_count_x
+; CHECK-NEXT:      - .offset:         28
+; CHECK-NEXT:        .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_block_count_y
+; CHECK-NEXT:      - .offset:         32
+; CHECK-NEXT:        .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_block_count_z
+; CHECK-NEXT:      - .offset:         36
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_group_size_x
+; CHECK-NEXT:      - .offset:         38
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_group_size_y
+; CHECK-NEXT:      - .offset:         40
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_group_size_z
+; CHECK-NEXT:      - .offset:         42
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_remainder_x
+; CHECK-NEXT:      - .offset:         44
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_remainder_y
+; CHECK-NEXT:      - .offset:         46
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_remainder_z
+; CHECK-NEXT:      - .offset:         64
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:      - .offset:         72
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:      - .offset:         80
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:      - .offset:         88
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_grid_dims
+; CHECK-NEXT:      - .offset:         96
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:      - .offset:         104
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_hostcall_buffer
+; CHECK-NEXT:      - .offset:         112
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_multigrid_sync_arg
+; CHECK-NEXT:      - .offset:         120
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_heap_v1
+; CHECK-NEXT:      - .offset:         128
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_default_queue
+; CHECK-NEXT:      - .offset:         136
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_completion_action
+; CHECK:          - .offset:          144
+; CHECK-NEXT:        .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_dynamic_lds_size
+; CHECK:          - .offset:          224
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_queue_ptr
+
+; CHECK:          .name:           test_v5
+; CHECK:          .symbol:         test_v5.kd
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 2
+@lds = external hidden addrspace(3) global [0 x i32], align 4
+
+define void @funcs_dyn_lds(ptr addrspace(3) %lds_ptr) {
+  store i32 1234, ptr addrspace(3) %lds_ptr, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_v5(
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #0 {
+entry:
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, ptr addrspace(1) %r
+  call void @funcs_dyn_lds(ptr addrspace(3) @lds)
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
+!llvm.printf.fmts = !{!1, !2}
+!1 = !{!"1:1:4:%d\5Cn"}
+!2 = !{!"2:1:8:%g\5Cn"}
+
+attributes #0 = { optnone noinline }
+

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-kernarg-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-kernarg-hidden-args-v5.ll
new file mode 100644
index 00000000000000..d457c61b8d4081
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-kernarg-hidden-args-v5.ll
@@ -0,0 +1,125 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s
+
+
+; CHECK:	amdhsa.kernels:
+; CHECK-NEXT:       - .args:
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .name:           r
+; CHECK-NEXT:         .offset:         0
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     global_buffer
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .name:           a
+; CHECK-NEXT:         .offset:         8
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     global_buffer
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .name:           b
+; CHECK-NEXT:         .offset:         16
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     global_buffer
+; CHECK-NEXT:       - .address_space:  local
+; CHECK-NEXT:         .name:           lds_ptr
+; CHECK-NEXT:         .offset:         24
+; CHECK-NEXT:         .pointee_align:  1
+; CHECK-NEXT:         .size:           4
+; CHECK-NEXT:         .value_kind:     dynamic_shared_pointer
+; CHECK-NEXT:       - .offset:         32
+; CHECK-NEXT:         .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_block_count_x
+; CHECK-NEXT:      - .offset:         36
+; CHECK-NEXT:        .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_block_count_y
+; CHECK-NEXT:      - .offset:         40
+; CHECK-NEXT:        .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_block_count_z
+; CHECK-NEXT:      - .offset:         44
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_group_size_x
+; CHECK-NEXT:      - .offset:         46
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_group_size_y
+; CHECK-NEXT:      - .offset:         48
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_group_size_z
+; CHECK-NEXT:      - .offset:         50
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_remainder_x
+; CHECK-NEXT:      - .offset:         52
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_remainder_y
+; CHECK-NEXT:      - .offset:         54
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_remainder_z
+; CHECK-NEXT:      - .offset:         72
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:      - .offset:         80
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:      - .offset:         88
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:      - .offset:         96
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_grid_dims
+; CHECK-NEXT:      - .offset:         104
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:      - .offset:         112
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_hostcall_buffer
+; CHECK-NEXT:      - .offset:         120
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_multigrid_sync_arg
+; CHECK-NEXT:      - .offset:         128
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_heap_v1
+; CHECK-NEXT:      - .offset:         136
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_default_queue
+; CHECK-NEXT:      - .offset:         144
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_completion_action
+; CHECK:          - .offset:          152
+; CHECK-NEXT:        .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_dynamic_lds_size
+; CHECK:          - .offset:          232
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_queue_ptr
+
+; CHECK:          .name:           test_v5
+; CHECK:          .symbol:         test_v5.kd
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 2
+
+define amdgpu_kernel void @test_v5(
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b,
+    ptr addrspace(3) %lds_ptr) #0 {
+entry:
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, ptr addrspace(1) %r
+  store i32 1234, ptr addrspace(3) %lds_ptr, align 4
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
+!llvm.printf.fmts = !{!1, !2}
+!1 = !{!"1:1:4:%d\5Cn"}
+!2 = !{!"2:1:8:%g\5Cn"}
+
+attributes #0 = { optnone noinline }
+

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll
index cb30d668674c31..1a2ce636c733c5 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll
@@ -81,13 +81,16 @@
 ; CHECK-NEXT:      - .offset:         136
 ; CHECK-NEXT:        .size:           8
 ; CHECK-NEXT:        .value_kind:     hidden_completion_action
+; CHECK:          - .offset:          144
+; CHECK-NEXT:        .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_dynamic_lds_size
 ; GFX8-NEXT:      - .offset:         216
 ; GFX8-NEXT:        .size:           4
 ; GFX8-NEXT:        .value_kind:     hidden_private_base
 ; GFX8-NEXT:      - .offset:         220
 ; GFX8-NEXT:        .size:           4
 ; GFX8-NEXT:        .value_kind:     hidden_shared_base
-; CHECK:      - .offset:         224
+; CHECK:          - .offset:          224
 ; CHECK-NEXT:        .size:           8
 ; CHECK-NEXT:        .value_kind:     hidden_queue_ptr
 
@@ -97,6 +100,7 @@
 ; CHECK:  amdhsa.version:
 ; CHECK-NEXT: - 1
 ; CHECK-NEXT: - 2
+@lds = external hidden addrspace(3) global [0 x i32], align 4
 define amdgpu_kernel void @test_v5(
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -106,6 +110,7 @@ entry:
   %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
   store half %r.val, ptr addrspace(1) %r
+  store i32 1234, ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 4
   ret void
 }
 


        


More information about the llvm-commits mailing list