[Mlir-commits] [mlir] [ROCDL] Use attached target on the GPU module when lowering GPU ops to ROCDL (PR #110735)

Fri Oct 4 07:32:18 PDT 2024

https://github.com/umangyadav updated https://github.com/llvm/llvm-project/pull/110735

>From ba8efb5c2c295ce7cabf4147c4b79d72401ded20 Mon Sep 17 00:00:00 2001
From: Umang Yadav <umayadav at amd.com>
Date: Tue, 1 Oct 2024 20:08:46 +0000
Subject: [PATCH 1/3] Use attached target on the GPU module when lowering GPU
 ops to ROCDL

---
 .../Conversion/GPUToROCDL/GPUToROCDLPass.h    |  2 +-
 mlir/include/mlir/Conversion/Passes.td        |  4 +--
 .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp      | 29 +++++++++++++++----
 .../GPUCommon/lower-memory-space-attrs.mlir   |  2 +-
 .../GPUCommon/memory-attrbution.mlir          |  2 +-
 .../GPUCommon/memref-arg-attrs.mlir           |  2 +-
 .../GPUCommon/memref-arg-noalias-attrs.mlir   |  2 +-
 .../GPUCommon/memref-arg-noalias-warning.mlir |  2 +-
 .../GPUToROCDL/gpu-to-rocdl-hip.mlir          |  2 +-
 .../GPUToROCDL/gpu-to-rocdl-opencl.mlir       |  2 +-
 .../Conversion/GPUToROCDL/gpu-to-rocdl.mlir   |  4 +--
 mlir/test/Conversion/GPUToROCDL/memref.mlir   |  4 +--
 .../Integration/GPU/ROCM/gpu-to-hsaco.mlir    |  2 +-
 mlir/test/Integration/GPU/ROCM/printf.mlir    |  2 +-
 .../Integration/GPU/ROCM/two-modules.mlir     |  2 +-
 mlir/test/Integration/GPU/ROCM/vecadd.mlir    |  2 +-
 .../GPU/ROCM/vector-transferops.mlir          |  2 +-
 17 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
index 5647787712997b..f1233ad894daf5 100644
--- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
+++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
@@ -42,7 +42,7 @@ void configureGpuToROCDLConversionLegality(ConversionTarget &target);
 /// is configurable.
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
 createLowerGpuOpsToROCDLOpsPass(
-    const std::string &chipset = "gfx900",
+    const std::string &chipset = "infer",
     unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
     bool useBarePtrCallConv = false,
     gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown);
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 4d272ba219c6f1..d536ee25a75637 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -592,8 +592,8 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> {
   ];
   let options = [
     Option<"chipset", "chipset", "std::string",
-           /*default=*/"\"gfx000\"",
-           "Chipset that these operations will run on">,
+           /*default=*/"\"infer\"",
+           "Chipset that these operations will run on. By Default it will infer target from attached target attribute on GPU module on which it operates">,
     Option<"indexBitwidth", "index-bitwidth", "unsigned",
            /*default=kDeriveIndexBitwidthFromDataLayout*/"0",
            "Bitwidth of the index type, 0 to use size of machine word">,
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 482c9e2c2d0017..83607d2c46a0f4 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -219,6 +219,29 @@ struct LowerGpuOpsToROCDLOpsPass
     gpu::GPUModuleOp m = getOperation();
     MLIRContext *ctx = m.getContext();
 
+    ArrayAttr targets = m.getTargetsAttr();
+    if (chipset == "infer") {
+      if (!targets) {
+        emitError(UnknownLoc::get(ctx),
+                  "ROCDLTargetAttr is empty on GPU module");
+        return signalPassFailure();
+      }
+      if (targets.size() != 1) {
+        emitError(UnknownLoc::get(ctx), "ROCDLTargetAttrs has more specified "
+                                        "more than one gpu-arch on GPU module");
+        return signalPassFailure();
+      }
+      const ROCDL::ROCDLTargetAttr targetAttr =
+          mlir::dyn_cast<ROCDL::ROCDLTargetAttr>(targets.getValue().front());
+      chipset = targetAttr.getChip().str();
+    }
+
+    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
+    if (failed(maybeChipset)) {
+      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
+      return signalPassFailure();
+    }
+
     auto llvmDataLayout = m->getAttrOfType<StringAttr>(
         LLVM::LLVMDialect::getDataLayoutAttrName());
     if (!llvmDataLayout) {
@@ -231,12 +254,6 @@ struct LowerGpuOpsToROCDLOpsPass
                     UnitAttr::get(ctx));
     }
 
-    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
-    if (failed(maybeChipset)) {
-      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
-      return signalPassFailure();
-    }
-
     /// Customize the bitwidth used for the device side index computations.
     LowerToLLVMOptions options(
         ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
diff --git a/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir b/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir
index 771f3185904bb8..a338d35525eba7 100644
--- a/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl | FileCheck %s --check-prefixes=CHECK,ROCDL
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900' | FileCheck %s --check-prefixes=CHECK,ROCDL
 // RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm | FileCheck %s --check-prefixes=CHECK,NVVM
 
 gpu.module @kernel {
diff --git a/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir b/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir
index 4fc19b8e93646c..b1291e07c060b5 100644
--- a/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir
+++ b/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck --check-prefix=NVVM %s
-// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl --split-input-file %s | FileCheck --check-prefix=ROCDL %s
+// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl='chipset=gfx900' --split-input-file %s | FileCheck --check-prefix=ROCDL %s
 
 gpu.module @kernel {
   // NVVM-LABEL:  llvm.func @private
diff --git a/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir b/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir
index e7c742067b4eb5..3c3082c473896e 100644
--- a/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir
+++ b/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=0' | FileCheck %s --check-prefixes=CHECK,ROCDL
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=0' | FileCheck %s --check-prefixes=CHECK,ROCDL
 // RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm='use-bare-ptr-memref-call-conv=0' | FileCheck %s --check-prefixes=CHECK,NVVM
 
 gpu.module @kernel {
diff --git a/mlir/test/Conversion/GPUCommon/memref-arg-noalias-attrs.mlir b/mlir/test/Conversion/GPUCommon/memref-arg-noalias-attrs.mlir
index 33cdc3348e5137..d17214d1f2299b 100644
--- a/mlir/test/Conversion/GPUCommon/memref-arg-noalias-attrs.mlir
+++ b/mlir/test/Conversion/GPUCommon/memref-arg-noalias-attrs.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=1' | FileCheck %s --check-prefixes=CHECK,ROCDL
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=1' | FileCheck %s --check-prefixes=CHECK,ROCDL
 // RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm='use-bare-ptr-memref-call-conv=1' | FileCheck %s --check-prefixes=CHECK,NVVM
 
 gpu.module @kernel {
diff --git a/mlir/test/Conversion/GPUCommon/memref-arg-noalias-warning.mlir b/mlir/test/Conversion/GPUCommon/memref-arg-noalias-warning.mlir
index 793df7380d78bd..ab98be59a2c87e 100644
--- a/mlir/test/Conversion/GPUCommon/memref-arg-noalias-warning.mlir
+++ b/mlir/test/Conversion/GPUCommon/memref-arg-noalias-warning.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=0' -verify-diagnostics
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=0' -verify-diagnostics
 
 gpu.module @kernel {
 // expected-warning @+1 {{Cannot copy noalias with non-bare pointers.}}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
index 1b904fa142bad3..3e3b43c6d4f493 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=HIP' -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900 runtime=HIP' -split-input-file | FileCheck %s
 
 gpu.module @test_module {
   // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL0:[A-Za-z0-9_]+]]("Hello, world\0A\00")
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir
index 870f5c5016ecef..fa01801972d6a4 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=OpenCL' | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900 runtime=OpenCL' | FileCheck %s
 
 gpu.module @test_module {
   // CHECK: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00")  {addr_space = 4 : i32}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 0d3e9f4ea2bf39..ca827e9acffe3b 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -1,5 +1,5 @@
-// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900' -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900 index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
 
 // CHECK-LABEL: @test_module
 // CHECK-SAME: llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
diff --git a/mlir/test/Conversion/GPUToROCDL/memref.mlir b/mlir/test/Conversion/GPUToROCDL/memref.mlir
index e645481c892308..debf899dd68742 100644
--- a/mlir/test/Conversion/GPUToROCDL/memref.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/memref.mlir
@@ -1,6 +1,6 @@
-// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900' -split-input-file | FileCheck %s
 // RUN: mlir-opt %s \
-// RUN:   -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=true' \
+// RUN:   -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=true' \
 // RUN:   -split-input-file \
 // RUN: | FileCheck %s --check-prefix=BARE
 
diff --git a/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir b/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
index 3c8f3b1d0cbf4b..edb75ee81224ef 100644
--- a/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
+++ b/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl),rocdl-attach-target{chip=%chip})' \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl))' \
 // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_rocm_runtime \
diff --git a/mlir/test/Integration/GPU/ROCM/printf.mlir b/mlir/test/Integration/GPU/ROCM/printf.mlir
index d5e6e3757540b2..e8feeaa69c2907 100644
--- a/mlir/test/Integration/GPU/ROCM/printf.mlir
+++ b/mlir/test/Integration/GPU/ROCM/printf.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{index-bitwidth=32 runtime=HIP}),rocdl-attach-target{chip=%chip})' \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl{index-bitwidth=32 runtime=HIP}))' \
 // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_rocm_runtime \
diff --git a/mlir/test/Integration/GPU/ROCM/two-modules.mlir b/mlir/test/Integration/GPU/ROCM/two-modules.mlir
index d49d3957abbe96..d20f71d162800c 100644
--- a/mlir/test/Integration/GPU/ROCM/two-modules.mlir
+++ b/mlir/test/Integration/GPU/ROCM/two-modules.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl),rocdl-attach-target{chip=%chip})' \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl))' \
 // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_rocm_runtime \
diff --git a/mlir/test/Integration/GPU/ROCM/vecadd.mlir b/mlir/test/Integration/GPU/ROCM/vecadd.mlir
index 986d8239427e3c..0ac391cd5f8e3b 100644
--- a/mlir/test/Integration/GPU/ROCM/vecadd.mlir
+++ b/mlir/test/Integration/GPU/ROCM/vecadd.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -convert-scf-to-cf \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{use-bare-ptr-memref-call-conv=true}),rocdl-attach-target{chip=%chip})' \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl{use-bare-ptr-memref-call-conv=true}))' \
 // RUN: | mlir-opt -gpu-to-llvm=use-bare-pointers-for-kernels=true -reconcile-unrealized-casts -gpu-module-to-binary \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_rocm_runtime \
diff --git a/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir b/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
index 575d967dcc9a23..417f67e64669ed 100644
--- a/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
+++ b/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -convert-scf-to-cf \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{chipset=%chip index-bitwidth=32}),rocdl-attach-target{chip=%chip})' \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl{index-bitwidth=32}))' \
 // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_rocm_runtime \

>From 693afaa9fbb5638e45d059613450ca8a054004a3 Mon Sep 17 00:00:00 2001
From: Umang Yadav <umayadav at amd.com>
Date: Wed, 2 Oct 2024 13:07:55 +0000
Subject: [PATCH 2/3] Add test for reading attached target

---
 .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp      |  9 +++---
 .../GPUToROCDL/gpu-to-rocdl-infer-target.mlir | 30 +++++++++++++++++++
 2 files changed, 34 insertions(+), 5 deletions(-)
 create mode 100644 mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir

diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 83607d2c46a0f4..d1adcc4c8e64a4 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -222,13 +222,12 @@ struct LowerGpuOpsToROCDLOpsPass
     ArrayAttr targets = m.getTargetsAttr();
     if (chipset == "infer") {
       if (!targets) {
-        emitError(UnknownLoc::get(ctx),
-                  "ROCDLTargetAttr is empty on GPU module");
+        m->emitError("ROCDLTargetAttr is empty on GPU module");
         return signalPassFailure();
       }
       if (targets.size() != 1) {
-        emitError(UnknownLoc::get(ctx), "ROCDLTargetAttrs has more specified "
-                                        "more than one gpu-arch on GPU module");
+        m->emitError("ROCDLTargetAttrs has more specified more than one "
+                     "gpu-arch on GPU module");
         return signalPassFailure();
       }
       const ROCDL::ROCDLTargetAttr targetAttr =
@@ -238,7 +237,7 @@ struct LowerGpuOpsToROCDLOpsPass
 
     FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
     if (failed(maybeChipset)) {
-      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
+      m->emitError("Invalid chipset name: " + chipset);
       return signalPassFailure();
     }
 
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir
new file mode 100644
index 00000000000000..83e9470ec1b532
--- /dev/null
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir
@@ -0,0 +1,30 @@
+// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file --verify-diagnostics | FileCheck --check-prefix=CHECK_TARGET %s
+
+// CHECK_TARGET: @test_module [#rocdl.target<O = 3, chip = "gfx90a">]  attributes {llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"} {
+gpu.module @test_module [#rocdl.target<O = 3, chip = "gfx90a">] {
+  // CHECK_TARGET-LABEL: @kernel_func
+  // CHECK_TARGET: attributes
+  // CHECK_TARGET: gpu.kernel
+  // CHECK_TARGET: rocdl.kernel
+  gpu.func @kernel_func() kernel {
+    gpu.return
+  }
+}
+
+// -----
+
+// expected-error at below {{ROCDLTargetAttr is empty on GPU module}}
+gpu.module @test_module {
+  gpu.func @kernel_func() kernel {
+    gpu.return
+  }
+}
+
+// -----
+
+// expected-error at below {{Invalid chipset name: gfx90a,gfx900}}
+gpu.module @test_module [#rocdl.target<O = 3, chip = "gfx90a,gfx900">] {
+  gpu.func @kernel_func() kernel {
+    gpu.return
+  }
+}

>From 4c18e8ad13830d4ef5773fb3f3c29bd56928d12c Mon Sep 17 00:00:00 2001
From: Umang Yadav <29876643+umangyadav at users.noreply.github.com>
Date: Fri, 4 Oct 2024 10:11:01 -0400
Subject: [PATCH 3/3] Address review comments from fabian and update error
 strings in the tests

---
 .../Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 13 ++++++-------
 .../GPUToROCDL/gpu-to-rocdl-infer-target.mlir       |  4 ++--
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index d1adcc4c8e64a4..bdc117d44ba603 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -219,25 +219,24 @@ struct LowerGpuOpsToROCDLOpsPass
     gpu::GPUModuleOp m = getOperation();
     MLIRContext *ctx = m.getContext();
 
-    ArrayAttr targets = m.getTargetsAttr();
     if (chipset == "infer") {
+      ArrayAttr targets = m.getTargetsAttr();
       if (!targets) {
-        m->emitError("ROCDLTargetAttr is empty on GPU module");
+        m->emitError("there are no target attributes to infer");
         return signalPassFailure();
       }
       if (targets.size() != 1) {
-        m->emitError("ROCDLTargetAttrs has more specified more than one "
-                     "gpu-arch on GPU module");
+        m->emitError("expected a single target attribute");
         return signalPassFailure();
       }
-      const ROCDL::ROCDLTargetAttr targetAttr =
-          mlir::dyn_cast<ROCDL::ROCDLTargetAttr>(targets.getValue().front());
+      ROCDL::ROCDLTargetAttr targetAttr =
+          dyn_cast<ROCDL::ROCDLTargetAttr>(targets[0]);
       chipset = targetAttr.getChip().str();
     }
 
     FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
     if (failed(maybeChipset)) {
-      m->emitError("Invalid chipset name: " + chipset);
+      m->emitError("invalid chipset name: " + chipset);
       return signalPassFailure();
     }
 
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir
index 83e9470ec1b532..4ef6fd004b13c5 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir
@@ -13,7 +13,7 @@ gpu.module @test_module [#rocdl.target<O = 3, chip = "gfx90a">] {
 
 // -----
 
-// expected-error at below {{ROCDLTargetAttr is empty on GPU module}}
+// expected-error at below {{there are no target attributes to infer}}
 gpu.module @test_module {
   gpu.func @kernel_func() kernel {
     gpu.return
@@ -22,7 +22,7 @@ gpu.module @test_module {
 
 // -----
 
-// expected-error at below {{Invalid chipset name: gfx90a,gfx900}}
+// expected-error at below {{invalid chipset name: gfx90a,gfx900}}
 gpu.module @test_module [#rocdl.target<O = 3, chip = "gfx90a,gfx900">] {
   gpu.func @kernel_func() kernel {
     gpu.return