[Mlir-commits] [mlir] 016e1eb - [mlir][gpu] Add metadata attributes for storing kernel metadata in GPU objects (#95292)

llvmlistbot at llvm.org llvmlistbot at llvm.org
Tue Aug 27 15:44:53 PDT 2024


Author: Fabian Mora
Date: 2024-08-27T18:44:50-04:00
New Revision: 016e1eb9c86923bf6a9669697f6be8309d12b78c

URL: https://github.com/llvm/llvm-project/commit/016e1eb9c86923bf6a9669697f6be8309d12b78c
DIFF: https://github.com/llvm/llvm-project/commit/016e1eb9c86923bf6a9669697f6be8309d12b78c.diff

LOG: [mlir][gpu] Add metadata attributes for storing kernel metadata in GPU objects (#95292)

This patch adds the `#gpu.kernel_metadata` and `#gpu.kernel_table`
attributes. The `#gpu.kernel_metadata` attribute stores metadata
related to a compiled kernel, for example, the number of scalar
registers used by the kernel. The attribute has only two required
parameters, the name and the function type, and two optional
parameters, the argument attributes and a generic dictionary for
storing all other metadata.

The `#gpu.kernel_table` attribute stores a table of
`#gpu.kernel_metadata` entries, mapping each kernel name to its
metadata.

Finally, the function `ROCDL::getAMDHSAKernelsELFMetadata` was added to
collect ELF metadata from a binary and to exercise the class methods of
both attributes.

Example:
```mlir
gpu.binary @binary [#gpu.object<#rocdl.target<chip = "gfx900">, kernels = #gpu.kernel_table<[
    #gpu.kernel_metadata<"kernel0", (i32) -> (), metadata = {sgpr_count = 255}>,
    #gpu.kernel_metadata<"kernel1", (i32, f32) -> (), arg_attrs = [{llvm.read_only}, {}]>
  ]>, bin = "BLOB">]

```
The motivation behind these attributes is to provide useful information
for things like tuning.
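
To consume that information from C++, a pass can use the query helpers
declared on the new attributes. A minimal sketch follows; the
`printSgprCount` helper is hypothetical, while the `sgpr_count` key
matches what `ROCDL::getAMDHSAKernelsELFMetadata` emits:
```c++
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "llvm/Support/raw_ostream.h"

using namespace mlir;

// Hypothetical helper: print the SGPR count recorded for `kernelName` in the
// kernel table attached to a #gpu.object attribute.
static void printSgprCount(gpu::ObjectAttr object, llvm::StringRef kernelName) {
  gpu::KernelTableAttr table = object.getKernels();
  if (!table)
    return; // The object carries no kernel metadata.
  // `lookup` returns a null attribute when the kernel is not in the table.
  gpu::KernelMetadataAttr kernel = table.lookup(kernelName);
  if (!kernel)
    return;
  // Entries of the optional `metadata` dictionary are fetched by key.
  if (auto sgprCount = kernel.getAttr<IntegerAttr>("sgpr_count"))
    llvm::outs() << kernel.getName().getValue() << ": " << sgprCount.getInt()
                 << " SGPRs\n";
}
```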

---------

Co-authored-by: Mehdi Amini <joker.eph at gmail.com>

Added: 
    mlir/lib/Target/LLVM/ROCDL/Utils.cpp

Modified: 
    mlir/include/mlir-c/Dialect/GPU.h
    mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
    mlir/include/mlir/Target/LLVM/ROCDL/Utils.h
    mlir/lib/Bindings/Python/DialectGPU.cpp
    mlir/lib/CAPI/Dialect/GPU.cpp
    mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
    mlir/lib/Target/LLVM/CMakeLists.txt
    mlir/lib/Target/LLVM/NVVM/Target.cpp
    mlir/lib/Target/LLVM/ROCDL/Target.cpp
    mlir/lib/Target/SPIRV/Target.cpp
    mlir/test/Dialect/GPU/invalid.mlir
    mlir/test/Dialect/GPU/ops.mlir
    mlir/test/python/dialects/gpu/dialect.py
    mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
    mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir-c/Dialect/GPU.h b/mlir/include/mlir-c/Dialect/GPU.h
index c42ff61f9592c7..321c1122c33707 100644
--- a/mlir/include/mlir-c/Dialect/GPU.h
+++ b/mlir/include/mlir-c/Dialect/GPU.h
@@ -37,6 +37,11 @@ MLIR_CAPI_EXPORTED MlirAttribute
 mlirGPUObjectAttrGet(MlirContext mlirCtx, MlirAttribute target, uint32_t format,
                      MlirStringRef objectStrRef, MlirAttribute mlirObjectProps);
 
+MLIR_CAPI_EXPORTED MlirAttribute mlirGPUObjectAttrGetWithKernels(
+    MlirContext mlirCtx, MlirAttribute target, uint32_t format,
+    MlirStringRef objectStrRef, MlirAttribute mlirObjectProps,
+    MlirAttribute mlirKernelsAttr);
+
 MLIR_CAPI_EXPORTED MlirAttribute
 mlirGPUObjectAttrGetTarget(MlirAttribute mlirObjectAttr);
 
@@ -52,6 +57,12 @@ mlirGPUObjectAttrHasProperties(MlirAttribute mlirObjectAttr);
 MLIR_CAPI_EXPORTED MlirAttribute
 mlirGPUObjectAttrGetProperties(MlirAttribute mlirObjectAttr);
 
+MLIR_CAPI_EXPORTED bool
+mlirGPUObjectAttrHasKernels(MlirAttribute mlirObjectAttr);
+
+MLIR_CAPI_EXPORTED MlirAttribute
+mlirGPUObjectAttrGetKernels(MlirAttribute mlirObjectAttr);
+
 #ifdef __cplusplus
 }
 #endif
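
A minimal sketch of how the new C API entry points might be driven from
client code; the `dumpKernelTable` helper and the already-constructed
`objectAttr` are assumptions of the example:
```c++
#include "mlir-c/Dialect/GPU.h"
#include "mlir-c/IR.h"

// Sketch only: `objectAttr` is assumed to wrap an existing #gpu.object.
static void dumpKernelTable(MlirAttribute objectAttr) {
  if (!mlirGPUObjectAttrHasKernels(objectAttr))
    return; // No kernel metadata attached to this object.
  MlirAttribute kernels = mlirGPUObjectAttrGetKernels(objectAttr);
  mlirAttributeDump(kernels); // Prints the #gpu.kernel_table attribute.
}
```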

diff  --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
index 6659f4a2c58e82..07879a0dab07f4 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
@@ -16,6 +16,155 @@
 include "mlir/Dialect/GPU/IR/GPUBase.td"
 include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
 
+//===----------------------------------------------------------------------===//
+// GPU kernel metadata attribute
+//===----------------------------------------------------------------------===//
+
+def GPU_KernelMetadataAttr : GPU_Attr<"KernelMetadata", "kernel_metadata"> {
+  let description = [{
+    GPU attribute for storing metadata related to a compiled kernel. The
+    attribute contains the name and argument types of the kernel.
+
+    The attribute also contains optional parameters for storing the argument
+    attributes as well as a dictionary for additional metadata, like occupancy
+    information or other function attributes.
+
+    Note: The `arg_attrs` parameter is expected to follow all the constraints
+    imposed by the `mlir::FunctionOpInterface` interface.
+
+    Examples:
+    ```mlir
+      #gpu.kernel_metadata<@kernel1, (i32) -> (), arg_attrs = [...],  metadata = {reg_count = 255, ...}>
+      #gpu.kernel_metadata<@kernel2, (i32, f64) -> ()>
+    ```
+  }];
+  let parameters = (ins
+    "StringAttr":$name,
+    "Type":$function_type,
+    OptionalParameter<"ArrayAttr", "arguments attributes">:$arg_attrs,
+    OptionalParameter<"DictionaryAttr", "metadata dictionary">:$metadata
+  );
+  let assemblyFormat = [{
+    `<` $name `,` $function_type (`,` struct($arg_attrs, $metadata)^)? `>`
+  }];
+  let builders = [
+    AttrBuilderWithInferredContext<(ins "StringAttr":$name,
+                                        "Type":$functionType,
+                                        CArg<"ArrayAttr", "nullptr">:$argAttrs,
+                                        CArg<"DictionaryAttr",
+                                             "nullptr">:$metadata), [{
+      assert(name && "invalid name");
+      return $_get(name.getContext(), name, functionType, argAttrs, metadata);
+    }]>,
+    AttrBuilderWithInferredContext<(ins "FunctionOpInterface":$kernel,
+                                         CArg<"DictionaryAttr",
+                                              "nullptr">:$metadata)>
+  ];
+  let genVerifyDecl = 1;
+  let extraClassDeclaration = [{
+    /// Compare two kernels based on the name.
+    bool operator<(const KernelMetadataAttr& other) const {
+      return getName().getValue() < other.getName().getValue();
+    }
+
+    /// Returns the metadata attribute corresponding to `key` or `nullptr`
+    /// if missing.
+    Attribute getAttr(StringRef key) const {
+      DictionaryAttr attrs = getMetadata();
+      return attrs ? attrs.get(key) : nullptr;
+    }
+    template <typename ConcreteAttr>
+    ConcreteAttr getAttr(StringRef key) const {
+      return llvm::dyn_cast_or_null<ConcreteAttr>(getAttr(key));
+    }
+    Attribute getAttr(StringAttr key) const {
+      DictionaryAttr attrs = getMetadata();
+      return attrs ? attrs.get(key) : nullptr;
+    }
+    template <typename ConcreteAttr>
+    ConcreteAttr getAttr(StringAttr key) const {
+      return llvm::dyn_cast_or_null<ConcreteAttr>(getAttr(key));
+    }
+
+    /// Returns the attribute dictionary at position `index`.
+    DictionaryAttr getArgAttrDict(unsigned index) {
+      ArrayAttr argArray = getArgAttrs();
+      return argArray ? llvm::cast<DictionaryAttr>(argArray[index]) : nullptr;
+    }
+
+    /// Return the specified attribute, if present, for the argument at 'index',
+    /// null otherwise.
+    Attribute getArgAttr(unsigned index, StringAttr name) {
+      DictionaryAttr argDict = getArgAttrDict(index);
+      return argDict ? argDict.get(name) : nullptr;
+    }
+    Attribute getArgAttr(unsigned index, StringRef name) {
+      DictionaryAttr argDict = getArgAttrDict(index);
+      return argDict ? argDict.get(name) : nullptr;
+    }
+
+    /// Returns a new KernelMetadataAttr that contains `attrs` in the metadata dictionary.
+    KernelMetadataAttr appendMetadata(ArrayRef<NamedAttribute> attrs) const;
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// GPU kernel table attribute
+//===----------------------------------------------------------------------===//
+
+def GPU_KernelTableAttr : GPU_Attr<"KernelTable", "kernel_table"> {
+  let description = [{
+    GPU attribute representing a list of `#gpu.kernel_metadata` attributes. This
+    attribute supports searching kernels by name. All kernels in the table must
+    have a unique name.
+
+    Examples:
+    ```mlir
+      // Empty table.
+      #gpu.kernel_table<>
+
+      // Table with a single kernel.
+      #gpu.kernel_table<[#gpu.kernel_metadata<kernel0, () -> () >]>
+
+      // Table with multiple kernels.
+      #gpu.kernel_table<[
+        #gpu.kernel_metadata<"kernel0", (i32, f32) -> (), metadata = {sgpr_count = 255}>,
+        #gpu.kernel_metadata<"kernel1", (i32) -> ()>
+      ]>
+    ```
+  }];
+  let parameters = (ins
+    OptionalArrayRefParameter<"KernelMetadataAttr", "array of kernels">:$kernel_table
+  );
+  let assemblyFormat = [{
+    `<` (`[` qualified($kernel_table)^ `]`)? `>`
+  }];
+  let builders = [
+    AttrBuilder<(ins "ArrayRef<KernelMetadataAttr>":$kernels,
+                     CArg<"bool", "false">:$isSorted)>
+  ];
+  let skipDefaultBuilders = 1;
+  let genVerifyDecl = 1;
+  let extraClassDeclaration = [{
+    llvm::ArrayRef<KernelMetadataAttr>::iterator begin() const {
+      return getKernelTable().begin();
+    }
+    llvm::ArrayRef<KernelMetadataAttr>::iterator end() const {
+      return getKernelTable().end();
+    }
+    size_t size() const {
+      return getKernelTable().size();
+    }
+    bool empty() const {
+      return getKernelTable().empty();
+    }
+
+    /// Returns the kernel with name `key` or `nullptr` if not present.
+    KernelMetadataAttr lookup(StringRef key) const;
+    KernelMetadataAttr lookup(StringAttr key) const;
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // GPU object attribute.
 //===----------------------------------------------------------------------===//
@@ -36,8 +185,9 @@ def GPU_CompilationTargetEnum : GPU_I32Enum<
 def GPU_ObjectAttr : GPU_Attr<"Object", "object"> {
   let description = [{
     A GPU object attribute glues together a GPU target, the object kind, a
-    binary string with the object, and the object properties, encapsulating how
-    the object was generated and its properties with the object itself.
+    binary string with the object, the object properties, and kernel metadata,
+    encapsulating how the object was generated and its properties with the
+    object itself.
 
     There are four object formats:
     1. `Offload`: represents generic objects not described by the other three
@@ -55,6 +205,10 @@ def GPU_ObjectAttr : GPU_Attr<"Object", "object"> {
 
     Object properties are specified through the `properties` dictionary
     attribute and can be used to define additional information.
+
+    Kernel metadata is specified through the `kernels` parameter, and can be
+    used to specify additional information on a kernel-by-kernel basis.
+
     The target attribute must implement or promise the `TargetAttrInterface`
     interface.
 
@@ -63,16 +217,29 @@ def GPU_ObjectAttr : GPU_Attr<"Object", "object"> {
       #gpu.object<#nvvm.target, properties = {O = 3 : i32}, assembly = "..."> // An assembly object with additional properties.
       #gpu.object<#rocdl.target, bin = "..."> // A binary object.
       #gpu.object<#nvvm.target, "..."> // A fatbin object.
+      #gpu.object<#nvvm.target, kernels = #gpu.kernel_table<...>, "..."> // An object with a kernel table.
     ```
   }];
   let parameters = (ins
     "Attribute":$target,
     DefaultValuedParameter<"CompilationTarget", "CompilationTarget::Fatbin">:$format,
     "StringAttr":$object,
-    OptionalParameter<"DictionaryAttr">:$properties
+    OptionalParameter<"DictionaryAttr">:$properties,
+    OptionalParameter<"KernelTableAttr">:$kernels
   );
+  let builders = [
+    AttrBuilderWithInferredContext<(ins "Attribute":$target,
+                                        "CompilationTarget":$format,
+                                        "StringAttr":$object,
+                                        CArg<"DictionaryAttr", "nullptr">:$properties,
+                                        CArg<"KernelTableAttr", "nullptr">:$kernels), [{
+      assert(target && "invalid target");
+      return $_get(target.getContext(), target, format, object, properties, kernels);
+    }]>
+  ];
   let assemblyFormat = [{ `<`
-      $target `,`  (`properties` `=` $properties ^ `,`)?
+      $target `,`  (`properties` `=` $properties^ `,`)?
+      (`kernels` `=` $kernels^ `,`)?
       custom<Object>($format, $object)
     `>`
   }];

diff  --git a/mlir/include/mlir/Target/LLVM/ROCDL/Utils.h b/mlir/include/mlir/Target/LLVM/ROCDL/Utils.h
index 3c637a01b0e3be..3d2174c144815b 100644
--- a/mlir/include/mlir/Target/LLVM/ROCDL/Utils.h
+++ b/mlir/include/mlir/Target/LLVM/ROCDL/Utils.h
@@ -14,6 +14,7 @@
 #define MLIR_TARGET_LLVM_ROCDL_UTILS_H
 
 #include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Target/LLVM/ModuleToObject.h"
@@ -107,6 +108,20 @@ class SerializeGPUModuleBase : public LLVM::ModuleToObject {
   /// AMD GCN libraries to use when linking, the default is using none.
   AMDGCNLibraries deviceLibs = AMDGCNLibraries::None;
 };
+
+/// Returns a map containing the `amdhsa.kernels` ELF metadata for each of the
+/// kernels in the binary, or `std::nullopt` if the metadata couldn't be
+/// retrieved. The map associates the name of the kernel with the list of named
+/// attributes found in `amdhsa.kernels`. For more information on the ELF
+/// metadata see: https://llvm.org/docs/AMDGPUUsage.html#amdhsa
+std::optional<DenseMap<StringAttr, NamedAttrList>>
+getAMDHSAKernelsELFMetadata(Builder &builder, ArrayRef<char> elfData);
+
+/// Returns a `#gpu.kernel_table` containing kernel metadata for each of the
+/// kernels in `gpuModule`. If `elfData` is valid, then the `amdhsa.kernels` ELF
+/// metadata will be added to the `#gpu.kernel_table`.
+gpu::KernelTableAttr getKernelMetadata(Operation *gpuModule,
+                                       ArrayRef<char> elfData = {});
 } // namespace ROCDL
 } // namespace mlir
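
A minimal sketch of how these declarations might be used, assuming the
caller already has the serialized HSA code object in `elf` (the
`dumpElfKernelMetadata` helper is hypothetical):
```c++
#include "mlir/IR/Builders.h"
#include "mlir/Target/LLVM/ROCDL/Utils.h"
#include "llvm/Support/raw_ostream.h"

using namespace mlir;

// Sketch only: `ctx` and `elf` (the serialized HSA code object) are assumed
// to be provided by the caller.
static void dumpElfKernelMetadata(MLIRContext *ctx, ArrayRef<char> elf) {
  Builder builder(ctx);
  std::optional<DenseMap<StringAttr, NamedAttrList>> metadata =
      ROCDL::getAMDHSAKernelsELFMetadata(builder, elf);
  if (!metadata)
    return; // The ELF metadata could not be parsed.
  for (auto &[name, attrs] : *metadata)
    llvm::outs() << name.getValue() << " has " << attrs.size()
                 << " metadata entries\n";
}
```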
 

diff  --git a/mlir/lib/Bindings/Python/DialectGPU.cpp b/mlir/lib/Bindings/Python/DialectGPU.cpp
index a9e339b50dabc6..560a54bcd15919 100644
--- a/mlir/lib/Bindings/Python/DialectGPU.cpp
+++ b/mlir/lib/Bindings/Python/DialectGPU.cpp
@@ -48,17 +48,21 @@ PYBIND11_MODULE(_mlirDialectsGPU, m) {
       .def_classmethod(
           "get",
           [](py::object cls, MlirAttribute target, uint32_t format,
-             py::bytes object, std::optional<MlirAttribute> mlirObjectProps) {
+             py::bytes object, std::optional<MlirAttribute> mlirObjectProps,
+             std::optional<MlirAttribute> mlirKernelsAttr) {
             py::buffer_info info(py::buffer(object).request());
             MlirStringRef objectStrRef =
                 mlirStringRefCreate(static_cast<char *>(info.ptr), info.size);
-            return cls(mlirGPUObjectAttrGet(
+            return cls(mlirGPUObjectAttrGetWithKernels(
                 mlirAttributeGetContext(target), target, format, objectStrRef,
                 mlirObjectProps.has_value() ? *mlirObjectProps
+                                            : MlirAttribute{nullptr},
+                mlirKernelsAttr.has_value() ? *mlirKernelsAttr
                                             : MlirAttribute{nullptr}));
           },
           "cls"_a, "target"_a, "format"_a, "object"_a,
-          "properties"_a = py::none(), "Gets a gpu.object from parameters.")
+          "properties"_a = py::none(), "kernels"_a = py::none(),
+          "Gets a gpu.object from parameters.")
       .def_property_readonly(
           "target",
           [](MlirAttribute self) { return mlirGPUObjectAttrGetTarget(self); })
@@ -71,9 +75,16 @@ PYBIND11_MODULE(_mlirDialectsGPU, m) {
             MlirStringRef stringRef = mlirGPUObjectAttrGetObject(self);
             return py::bytes(stringRef.data, stringRef.length);
           })
-      .def_property_readonly("properties", [](MlirAttribute self) {
-        if (mlirGPUObjectAttrHasProperties(self))
-          return py::cast(mlirGPUObjectAttrGetProperties(self));
+      .def_property_readonly("properties",
+                             [](MlirAttribute self) {
+                               if (mlirGPUObjectAttrHasProperties(self))
+                                 return py::cast(
+                                     mlirGPUObjectAttrGetProperties(self));
+                               return py::none().cast<py::object>();
+                             })
+      .def_property_readonly("kernels", [](MlirAttribute self) {
+        if (mlirGPUObjectAttrHasKernels(self))
+          return py::cast(mlirGPUObjectAttrGetKernels(self));
         return py::none().cast<py::object>();
       });
 }

diff  --git a/mlir/lib/CAPI/Dialect/GPU.cpp b/mlir/lib/CAPI/Dialect/GPU.cpp
index 0acebb23004291..e4796ed1499ea1 100644
--- a/mlir/lib/CAPI/Dialect/GPU.cpp
+++ b/mlir/lib/CAPI/Dialect/GPU.cpp
@@ -43,9 +43,28 @@ MlirAttribute mlirGPUObjectAttrGet(MlirContext mlirCtx, MlirAttribute target,
   DictionaryAttr objectProps;
   if (mlirObjectProps.ptr != nullptr)
     objectProps = llvm::cast<DictionaryAttr>(unwrap(mlirObjectProps));
-  return wrap(gpu::ObjectAttr::get(ctx, unwrap(target),
-                                   static_cast<gpu::CompilationTarget>(format),
-                                   StringAttr::get(ctx, object), objectProps));
+  return wrap(gpu::ObjectAttr::get(
+      ctx, unwrap(target), static_cast<gpu::CompilationTarget>(format),
+      StringAttr::get(ctx, object), objectProps, nullptr));
+}
+
+MlirAttribute mlirGPUObjectAttrGetWithKernels(MlirContext mlirCtx,
+                                              MlirAttribute target,
+                                              uint32_t format,
+                                              MlirStringRef objectStrRef,
+                                              MlirAttribute mlirObjectProps,
+                                              MlirAttribute mlirKernelsAttr) {
+  MLIRContext *ctx = unwrap(mlirCtx);
+  llvm::StringRef object = unwrap(objectStrRef);
+  DictionaryAttr objectProps;
+  if (mlirObjectProps.ptr != nullptr)
+    objectProps = llvm::cast<DictionaryAttr>(unwrap(mlirObjectProps));
+  gpu::KernelTableAttr kernels;
+  if (mlirKernelsAttr.ptr != nullptr)
+    kernels = llvm::cast<gpu::KernelTableAttr>(unwrap(mlirKernelsAttr));
+  return wrap(gpu::ObjectAttr::get(
+      ctx, unwrap(target), static_cast<gpu::CompilationTarget>(format),
+      StringAttr::get(ctx, object), objectProps, kernels));
 }
 
 MlirAttribute mlirGPUObjectAttrGetTarget(MlirAttribute mlirObjectAttr) {
@@ -78,3 +97,15 @@ MlirAttribute mlirGPUObjectAttrGetProperties(MlirAttribute mlirObjectAttr) {
       llvm::cast<gpu::ObjectAttr>(unwrap(mlirObjectAttr));
   return wrap(objectAttr.getProperties());
 }
+
+bool mlirGPUObjectAttrHasKernels(MlirAttribute mlirObjectAttr) {
+  gpu::ObjectAttr objectAttr =
+      llvm::cast<gpu::ObjectAttr>(unwrap(mlirObjectAttr));
+  return objectAttr.getKernels() != nullptr;
+}
+
+MlirAttribute mlirGPUObjectAttrGetKernels(MlirAttribute mlirObjectAttr) {
+  gpu::ObjectAttr objectAttr =
+      llvm::cast<gpu::ObjectAttr>(unwrap(mlirObjectAttr));
+  return wrap(objectAttr.getKernels());
+}

diff  --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index a59952228ef6ea..e45ba7838b453c 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -2091,7 +2091,8 @@ void AllocOp::getCanonicalizationPatterns(RewritePatternSet &results,
 
 LogicalResult ObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
                                  Attribute target, CompilationTarget format,
-                                 StringAttr object, DictionaryAttr properties) {
+                                 StringAttr object, DictionaryAttr properties,
+                                 KernelTableAttr kernels) {
   if (!target)
     return emitError() << "the target attribute cannot be null";
   if (target.hasPromiseOrImplementsInterface<TargetAttrInterface>())
@@ -2177,6 +2178,113 @@ LogicalResult gpu::DynamicSharedMemoryOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// GPU KernelMetadataAttr
+//===----------------------------------------------------------------------===//
+
+KernelMetadataAttr KernelMetadataAttr::get(FunctionOpInterface kernel,
+                                           DictionaryAttr metadata) {
+  assert(kernel && "invalid kernel");
+  return get(kernel.getNameAttr(), kernel.getFunctionType(),
+             kernel.getAllArgAttrs(), metadata);
+}
+
+KernelMetadataAttr
+KernelMetadataAttr::getChecked(function_ref<InFlightDiagnostic()> emitError,
+                               FunctionOpInterface kernel,
+                               DictionaryAttr metadata) {
+  assert(kernel && "invalid kernel");
+  return getChecked(emitError, kernel.getNameAttr(), kernel.getFunctionType(),
+                    kernel.getAllArgAttrs(), metadata);
+}
+
+KernelMetadataAttr
+KernelMetadataAttr::appendMetadata(ArrayRef<NamedAttribute> attrs) const {
+  if (attrs.empty())
+    return *this;
+  NamedAttrList attrList;
+  if (DictionaryAttr dict = getMetadata())
+    attrList.append(dict);
+  attrList.append(attrs);
+  return KernelMetadataAttr::get(getName(), getFunctionType(), getArgAttrs(),
+                                 attrList.getDictionary(getContext()));
+}
+
+LogicalResult
+KernelMetadataAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                           StringAttr name, Type functionType,
+                           ArrayAttr argAttrs, DictionaryAttr metadata) {
+  if (name.empty())
+    return emitError() << "the kernel name can't be empty";
+  if (argAttrs) {
+    if (llvm::any_of(argAttrs, [](Attribute attr) {
+          return !llvm::isa<DictionaryAttr>(attr);
+        }))
+      return emitError()
+             << "all attributes in the array must be a dictionary attribute";
+  }
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// GPU KernelTableAttr
+//===----------------------------------------------------------------------===//
+
+KernelTableAttr KernelTableAttr::get(MLIRContext *context,
+                                     ArrayRef<KernelMetadataAttr> kernels,
+                                     bool isSorted) {
+  // Note that `is_sorted` is always only invoked once even with assertions ON.
+  assert((!isSorted || llvm::is_sorted(kernels)) &&
+         "expected a sorted kernel array");
+  // Immediately return the attribute if the array is sorted.
+  if (isSorted || llvm::is_sorted(kernels))
+    return Base::get(context, kernels);
+  // Sort the array.
+  SmallVector<KernelMetadataAttr> kernelsTmp(kernels);
+  llvm::array_pod_sort(kernelsTmp.begin(), kernelsTmp.end());
+  return Base::get(context, kernelsTmp);
+}
+
+KernelTableAttr KernelTableAttr::getChecked(
+    function_ref<InFlightDiagnostic()> emitError, MLIRContext *context,
+    ArrayRef<KernelMetadataAttr> kernels, bool isSorted) {
+  // Note that `is_sorted` is always only invoked once even with assertions ON.
+  assert((!isSorted || llvm::is_sorted(kernels)) &&
+         "expected a sorted kernel array");
+  // Immediately return the attribute if the array is sorted.
+  if (isSorted || llvm::is_sorted(kernels))
+    return Base::getChecked(emitError, context, kernels);
+  // Sort the array.
+  SmallVector<KernelMetadataAttr> kernelsTmp(kernels);
+  llvm::array_pod_sort(kernelsTmp.begin(), kernelsTmp.end());
+  return Base::getChecked(emitError, context, kernelsTmp);
+}
+
+LogicalResult
+KernelTableAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                        ArrayRef<KernelMetadataAttr> kernels) {
+  if (kernels.size() < 2)
+    return success();
+  // Check that the kernels are uniquely named.
+  if (std::adjacent_find(kernels.begin(), kernels.end(),
+                         [](KernelMetadataAttr l, KernelMetadataAttr r) {
+                           return l.getName() == r.getName();
+                         }) != kernels.end()) {
+    return emitError() << "expected all kernels to be uniquely named";
+  }
+  return success();
+}
+
+KernelMetadataAttr KernelTableAttr::lookup(StringRef key) const {
+  auto [iterator, found] = impl::findAttrSorted(begin(), end(), key);
+  return found ? *iterator : KernelMetadataAttr();
+}
+
+KernelMetadataAttr KernelTableAttr::lookup(StringAttr key) const {
+  auto [iterator, found] = impl::findAttrSorted(begin(), end(), key);
+  return found ? *iterator : KernelMetadataAttr();
+}
+
 //===----------------------------------------------------------------------===//
 // GPU target options
 //===----------------------------------------------------------------------===//

diff  --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt
index 93dc5ff9d35b74..bc14c568e46be2 100644
--- a/mlir/lib/Target/LLVM/CMakeLists.txt
+++ b/mlir/lib/Target/LLVM/CMakeLists.txt
@@ -110,10 +110,12 @@ endif()
 
 add_mlir_dialect_library(MLIRROCDLTarget
   ROCDL/Target.cpp
+  ROCDL/Utils.cpp
 
   OBJECT
 
   LINK_COMPONENTS
+  FrontendOffloading
   MCParser
   ${AMDGPU_LIBS}
 

diff  --git a/mlir/lib/Target/LLVM/NVVM/Target.cpp b/mlir/lib/Target/LLVM/NVVM/Target.cpp
index a75b7f92ed8dc3..806c405ac17dfa 100644
--- a/mlir/lib/Target/LLVM/NVVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/NVVM/Target.cpp
@@ -604,5 +604,5 @@ NVVMTargetAttrImpl::createObject(Attribute attribute, Operation *module,
   return builder.getAttr<gpu::ObjectAttr>(
       attribute, format,
       builder.getStringAttr(StringRef(object.data(), object.size())),
-      objectProps);
+      objectProps, /*kernels=*/nullptr);
 }

diff  --git a/mlir/lib/Target/LLVM/ROCDL/Target.cpp b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
index e32a0c7e14e85c..d8a79a7e80d643 100644
--- a/mlir/lib/Target/LLVM/ROCDL/Target.cpp
+++ b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
@@ -506,13 +506,15 @@ ROCDLTargetAttrImpl::createObject(Attribute attribute, Operation *module,
   gpu::CompilationTarget format = options.getCompilationTarget();
   // If format is `fatbin` transform it to binary as `fatbin` is not yet
   // supported.
-  if (format > gpu::CompilationTarget::Binary)
+  gpu::KernelTableAttr kernels;
+  if (format > gpu::CompilationTarget::Binary) {
     format = gpu::CompilationTarget::Binary;
-
+    kernels = ROCDL::getKernelMetadata(module, object);
+  }
   DictionaryAttr properties{};
   Builder builder(attribute.getContext());
-  return builder.getAttr<gpu::ObjectAttr>(
-      attribute, format,
-      builder.getStringAttr(StringRef(object.data(), object.size())),
-      properties);
+  StringAttr objectStr =
+      builder.getStringAttr(StringRef(object.data(), object.size()));
+  return builder.getAttr<gpu::ObjectAttr>(attribute, format, objectStr,
+                                          properties, kernels);
 }

diff  --git a/mlir/lib/Target/LLVM/ROCDL/Utils.cpp b/mlir/lib/Target/LLVM/ROCDL/Utils.cpp
new file mode 100644
index 00000000000000..04b1b22279e5db
--- /dev/null
+++ b/mlir/lib/Target/LLVM/ROCDL/Utils.cpp
@@ -0,0 +1,87 @@
+//===- Utils.cpp - MLIR ROCDL target utils ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines ROCDL target-related utility classes and functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVM/ROCDL/Utils.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Frontend/Offloading/Utility.h"
+
+using namespace mlir;
+using namespace mlir::ROCDL;
+
+std::optional<DenseMap<StringAttr, NamedAttrList>>
+mlir::ROCDL::getAMDHSAKernelsELFMetadata(Builder &builder,
+                                         ArrayRef<char> elfData) {
+  uint16_t elfABIVersion;
+  llvm::StringMap<llvm::offloading::amdgpu::AMDGPUKernelMetaData> kernels;
+  llvm::MemoryBufferRef buffer(StringRef(elfData.data(), elfData.size()),
+                               "buffer");
+  // Get the metadata.
+  llvm::Error error = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
+      buffer, kernels, elfABIVersion);
+  // Return `nullopt` if the metadata couldn't be retrieved.
+  if (error) {
+    llvm::consumeError(std::move(error));
+    return std::nullopt;
+  }
+  // Helper lambda for converting values.
+  auto getI32Array = [&builder](const uint32_t *array) {
+    return builder.getDenseI32ArrayAttr({static_cast<int32_t>(array[0]),
+                                         static_cast<int32_t>(array[1]),
+                                         static_cast<int32_t>(array[2])});
+  };
+  DenseMap<StringAttr, NamedAttrList> kernelMD;
+  for (const auto &[name, kernel] : kernels) {
+    NamedAttrList attrs;
+    // Add kernel metadata.
+    attrs.append("agpr_count", builder.getI64IntegerAttr(kernel.AGPRCount));
+    attrs.append("sgpr_count", builder.getI64IntegerAttr(kernel.SGPRCount));
+    attrs.append("vgpr_count", builder.getI64IntegerAttr(kernel.VGPRCount));
+    attrs.append("sgpr_spill_count",
+                 builder.getI64IntegerAttr(kernel.SGPRSpillCount));
+    attrs.append("vgpr_spill_count",
+                 builder.getI64IntegerAttr(kernel.VGPRSpillCount));
+    attrs.append("wavefront_size",
+                 builder.getI64IntegerAttr(kernel.WavefrontSize));
+    attrs.append("max_flat_workgroup_size",
+                 builder.getI64IntegerAttr(kernel.MaxFlatWorkgroupSize));
+    attrs.append("group_segment_fixed_size",
+                 builder.getI64IntegerAttr(kernel.GroupSegmentList));
+    attrs.append("private_segment_fixed_size",
+                 builder.getI64IntegerAttr(kernel.PrivateSegmentSize));
+    attrs.append("reqd_workgroup_size",
+                 getI32Array(kernel.RequestedWorkgroupSize));
+    attrs.append("workgroup_size_hint", getI32Array(kernel.WorkgroupSizeHint));
+    kernelMD[builder.getStringAttr(name)] = std::move(attrs);
+  }
+  return std::move(kernelMD);
+}
+
+gpu::KernelTableAttr mlir::ROCDL::getKernelMetadata(Operation *gpuModule,
+                                                    ArrayRef<char> elfData) {
+  auto module = cast<gpu::GPUModuleOp>(gpuModule);
+  Builder builder(module.getContext());
+  SmallVector<gpu::KernelMetadataAttr> kernels;
+  std::optional<DenseMap<StringAttr, NamedAttrList>> mdMapOrNull =
+      getAMDHSAKernelsELFMetadata(builder, elfData);
+  for (auto funcOp : module.getBody()->getOps<LLVM::LLVMFuncOp>()) {
+    if (!funcOp->getDiscardableAttr("rocdl.kernel"))
+      continue;
+    kernels.push_back(gpu::KernelMetadataAttr::get(
+        funcOp, mdMapOrNull ? builder.getDictionaryAttr(
+                                  mdMapOrNull->lookup(funcOp.getNameAttr()))
+                            : nullptr));
+  }
+  return gpu::KernelTableAttr::get(gpuModule->getContext(), kernels);
+}

diff  --git a/mlir/lib/Target/SPIRV/Target.cpp b/mlir/lib/Target/SPIRV/Target.cpp
index d48548bf9709c0..dd128e254aa0d6 100644
--- a/mlir/lib/Target/SPIRV/Target.cpp
+++ b/mlir/lib/Target/SPIRV/Target.cpp
@@ -98,5 +98,5 @@ SPIRVTargetAttrImpl::createObject(Attribute attribute, Operation *module,
   return builder.getAttr<gpu::ObjectAttr>(
       attribute, format,
       builder.getStringAttr(StringRef(object.data(), object.size())),
-      objectProps);
+      objectProps, /*kernels=*/nullptr);
 }

diff  --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index 20c1c4cf8a2d0b..fd7618020b5d89 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -848,3 +848,15 @@ module attributes {gpu.container_module} {
   gpu.module @kernel <> {
   }
 }
+
+// -----
+
+gpu.binary @binary [#gpu.object<#rocdl.target<chip = "gfx900">,
+  // expected-error at +1{{expected all kernels to be uniquely named}}
+    kernels = #gpu.kernel_table<[
+      #gpu.kernel_metadata<"kernel", (i32) -> ()>,
+      #gpu.kernel_metadata<"kernel", (i32, f32) -> (), metadata = {sgpr_count = 255}>
+  // expected-error at below{{failed to parse GPU_ObjectAttr parameter 'kernels' which is to be a `KernelTableAttr`}}
+    ]>,
+    bin = "BLOB">
+  ]

diff  --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index ba7897f4e80cb5..b9c0a0e79e8f2a 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -441,3 +441,26 @@ gpu.module @module_with_two_target [#nvvm.target, #rocdl.target<chip = "gfx90a">
 
 gpu.module @module_with_offload_handler <#gpu.select_object<0>> [#nvvm.target] {
 }
+
+// Test kernel attributes
+gpu.binary @kernel_attrs_1 [
+    #gpu.object<#rocdl.target<chip = "gfx900">,
+      kernels = #gpu.kernel_table<[
+        #gpu.kernel_metadata<"kernel0", (i32, f32) -> (), metadata = {sgpr_count = 255}>,
+        #gpu.kernel_metadata<"kernel1", (i32) -> (), arg_attrs = [{llvm.read_only}]>
+      ]>,
+      bin = "BLOB">
+  ]
+
+// Verify the kernels are sorted
+// CHECK-LABEL: gpu.binary @kernel_attrs_2
+gpu.binary @kernel_attrs_2 [
+    // CHECK: [#gpu.kernel_metadata<"a_kernel", () -> ()>, #gpu.kernel_metadata<"m_kernel", () -> ()>, #gpu.kernel_metadata<"z_kernel", () -> ()>]
+    #gpu.object<#rocdl.target<chip = "gfx900">,
+      kernels = #gpu.kernel_table<[
+        #gpu.kernel_metadata<"z_kernel", () -> ()>,
+        #gpu.kernel_metadata<"m_kernel", () -> ()>,
+        #gpu.kernel_metadata<"a_kernel", () -> ()>
+      ]>,
+      bin = "BLOB">
+  ]

diff  --git a/mlir/test/python/dialects/gpu/dialect.py b/mlir/test/python/dialects/gpu/dialect.py
index aded35b04aa1ea..26ee9f34cb3326 100644
--- a/mlir/test/python/dialects/gpu/dialect.py
+++ b/mlir/test/python/dialects/gpu/dialect.py
@@ -55,3 +55,12 @@ def testObjectAttr():
     # CHECK: #gpu.object<#nvvm.target, "//\0A// Generated by LLVM NVPTX Back-End\0A//\0A\0A.version 6.0\0A.target sm_50">
     print(o)
     assert o.object == object
+
+    object = b"BC\xc0\xde5\x14\x00\x00\x05\x00\x00\x00b\x0c0$MY\xbef"
+    kernelTable = Attribute.parse(
+        '#gpu.kernel_table<[#gpu.kernel_metadata<"kernel", () -> ()>]>'
+    )
+    o = gpu.ObjectAttr.get(target, format, object, kernels=kernelTable)
+    # CHECK: #gpu.object<#nvvm.target, kernels = <[#gpu.kernel_metadata<"kernel", () -> ()>]>, "BC\C0\DE5\14\00\00\05\00\00\00b\0C0$MY\BEf">
+    print(o)
+    assert o.kernels == kernelTable

diff  --git a/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp b/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
index 33291bc4bcaed9..43fa3d850d9e29 100644
--- a/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
@@ -158,3 +158,69 @@ TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(SerializeROCDLToBinary)) {
     ASSERT_FALSE(object->empty());
   }
 }
+
+// Test ROCDL metadata.
+TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(GetELFMetadata)) {
+  if (!hasROCMTools())
+    GTEST_SKIP() << "ROCm installation not found, skipping test.";
+
+  MLIRContext context(registry);
+
+  // MLIR module used for the tests.
+  const std::string moduleStr = R"mlir(
+    gpu.module @rocdl_test {
+    llvm.func @rocdl_kernel_1(%arg0: f32) attributes {gpu.kernel, rocdl.kernel} {
+      llvm.return
+    }
+    llvm.func @rocdl_kernel_0(%arg0: f32) attributes {gpu.kernel, rocdl.kernel} {
+      llvm.return
+    }
+    llvm.func @rocdl_kernel_2(%arg0: f32) attributes {gpu.kernel, rocdl.kernel} {
+      llvm.return
+    }
+    llvm.func @a_kernel(%arg0: f32) attributes {gpu.kernel, rocdl.kernel} {
+      llvm.return
+    }
+  })mlir";
+
+  OwningOpRef<ModuleOp> module =
+      parseSourceString<ModuleOp>(moduleStr, &context);
+  ASSERT_TRUE(!!module);
+
+  // Create a ROCDL target.
+  ROCDL::ROCDLTargetAttr target = ROCDL::ROCDLTargetAttr::get(&context);
+
+  // Serialize the module.
+  auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
+  ASSERT_TRUE(!!serializer);
+  gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Binary);
+  for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
+    std::optional<SmallVector<char, 0>> object =
+        serializer.serializeToObject(gpuModule, options);
+    // Check that the serializer was successful.
+    ASSERT_TRUE(object != std::nullopt);
+    ASSERT_FALSE(object->empty());
+    if (!object)
+      continue;
+    // Get the metadata.
+    gpu::KernelTableAttr metadata =
+        ROCDL::getKernelMetadata(gpuModule, *object);
+    ASSERT_TRUE(metadata != nullptr);
+    // There should be 4 kernels.
+    ASSERT_TRUE(metadata.size() == 4);
+    // Check that the lookup method finds the kernel.
+    ASSERT_TRUE(metadata.lookup("a_kernel") != nullptr);
+    ASSERT_TRUE(metadata.lookup("rocdl_kernel_0") != nullptr);
+    // Check that a nonexistent kernel is not found.
+    ASSERT_TRUE(metadata.lookup("not_existent_kernel") == nullptr);
+    // Test the `KernelMetadataAttr` iterators.
+    for (gpu::KernelMetadataAttr kernel : metadata) {
+      // Check that the ELF metadata is present.
+      ASSERT_TRUE(kernel.getMetadata() != nullptr);
+      // Verify that `sgpr_count` is present and it is an integer attribute.
+      ASSERT_TRUE(kernel.getAttr<IntegerAttr>("sgpr_count") != nullptr);
+      // Verify that `vgpr_count` is present and it is an integer attribute.
+      ASSERT_TRUE(kernel.getAttr<IntegerAttr>("vgpr_count") != nullptr);
+    }
+  }
+}

diff  --git a/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp b/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp
index 37dbfe62036871..aaa281e07933b5 100644
--- a/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp
@@ -116,7 +116,7 @@ TargetAttrImpl::createObject(Attribute attribute, Operation *module,
       module->getContext(), attribute, gpu::CompilationTarget::Offload,
       StringAttr::get(module->getContext(),
                       StringRef(object.data(), object.size())),
-      module->getAttrDictionary());
+      module->getAttrDictionary(), /*kernels=*/nullptr);
 }
 
 TEST_F(MLIRTargetLLVM, SKIP_WITHOUT_NATIVE(TargetAttrAPI)) {


        

