[Mlir-commits] [mlir] [mlir][gpu] Add optional attributes of kernelModule and kernelFunc for outlining kernels. (PR #118861)

Thu Dec 5 22:20:00 PST 2024

https://github.com/wangzpgi updated https://github.com/llvm/llvm-project/pull/118861

>From 524fc1393f649657a83ec1c7a3aa02491b779c1f Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Wed, 4 Dec 2024 16:50:51 -0800
Subject: [PATCH 1/9] Add optional attribute outline_module to gpu.launch

---
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td          | 12 +++++++++++-
 mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp | 13 ++++++++++---
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index d08e7ceb9e6c69..1a393cf3daba8c 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -803,7 +803,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [
                Optional<Index>:$clusterSizeX,
                Optional<Index>:$clusterSizeY,
                Optional<Index>:$clusterSizeZ,
-               Optional<I32>:$dynamicSharedMemorySize)>,
+               Optional<I32>:$dynamicSharedMemorySize,
+               OptionalAttr<SymbolRefAttr>:$outlineModule)>,
     Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
   let summary = "GPU kernel launch operation";
 
@@ -837,6 +838,10 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     -   a variadic number of Workgroup memory attributions.
     -   a variadic number of Private memory attributions.
 
+    The `outline_module` attribute is optional and specifies a module in which 
+    the kernel should be outlined. When this attribute is present, the kernel is
+    outlined into the specified module instead of the default behavior.
+
     Syntax:
 
     ```
@@ -1030,6 +1035,11 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     static StringRef getNumWorkgroupAttributionsAttrName() {
       return "workgroup_attributions";
     }
+
+    /// Checks if the outline_module attribute is present.
+    bool hasOutlineModule() {
+      return getOutlineModule().has_value();
+    }
   }];
 
   let hasCanonicalizer = 1;
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index 5f6556d915f41c..65b63e0f5b71db 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -364,9 +364,16 @@ class GpuKernelOutliningPass
       Block::iterator insertPt(func->getNextNode());
       auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
         SetVector<Value> operands;
-        std::string kernelFnName =
-            Twine(op->getParentOfType<SymbolOpInterface>().getName(), "_kernel")
-                .str();
+        std::string kernelFnName;
+        if (auto outlineModuleAttr = op->getAttrOfType<SymbolRefAttr>("outline_module")) {
+          kernelFnName = outlineModuleAttr.getRootReference().str();
+          llvm::errs() << "outlined module name = " << kernelFnName << "\n";
+        } else {
+          kernelFnName =
+              Twine(op->getParentOfType<SymbolOpInterface>().getName(), "_kernel")
+                  .str();
+          llvm::errs() << "original module name = " << kernelFnName << "\n";
+        }
 
         gpu::GPUFuncOp outlinedFunc =
             outlineKernelFuncImpl(op, kernelFnName, operands);

>From 303c7f95669b5d29d9a208626c2baaadb3638e94 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Thu, 5 Dec 2024 10:37:00 -0800
Subject: [PATCH 2/9] Add optional attributes kernelFunc and kernelModule to
 specify the kernel function name or kernel module name.

---
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    | 19 ++++++----
 .../GPU/Transforms/KernelOutlining.cpp        | 36 ++++++++++++++-----
 2 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 1a393cf3daba8c..94d3872a45e2f2 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -804,7 +804,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [
                Optional<Index>:$clusterSizeY,
                Optional<Index>:$clusterSizeZ,
                Optional<I32>:$dynamicSharedMemorySize,
-               OptionalAttr<SymbolRefAttr>:$outlineModule)>,
+               OptionalAttr<SymbolRefAttr>:$kernelFunc,
+               OptionalAttr<SymbolRefAttr>:$kernelModule)>,
     Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
   let summary = "GPU kernel launch operation";
 
@@ -838,9 +839,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     -   a variadic number of Workgroup memory attributions.
     -   a variadic number of Private memory attributions.
 
-    The `outline_module` attribute is optional and specifies a module in which 
-    the kernel should be outlined. When this attribute is present, the kernel is
-    outlined into the specified module instead of the default behavior.
+    The `kernelFunc` and `kernelModule` attributes are optional and specifies the kernel name and a module in whichthe kernel should be outlined. 
+
 
     Syntax:
 
@@ -1036,9 +1036,14 @@ def GPU_LaunchOp : GPU_Op<"launch", [
       return "workgroup_attributions";
     }
 
-    /// Checks if the outline_module attribute is present.
-    bool hasOutlineModule() {
-      return getOutlineModule().has_value();
+    /// Checks if the kernel func name attribute is present.
+    bool hasKernelFuncName() {
+      return getKernelFunc().has_value();
+    }
+
+    /// Checks if the kernel module name attribute is present.
+    bool hasKernelModuleName() {
+      return getKernelModule().has_value();
     }
   }];
 
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index 65b63e0f5b71db..6028cb58d6842c 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -365,14 +365,14 @@ class GpuKernelOutliningPass
       auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
         SetVector<Value> operands;
         std::string kernelFnName;
-        if (auto outlineModuleAttr = op->getAttrOfType<SymbolRefAttr>("outline_module")) {
-          kernelFnName = outlineModuleAttr.getRootReference().str();
-          llvm::errs() << "outlined module name = " << kernelFnName << "\n";
+        if (op.hasKernelFuncName()) {
+          kernelFnName = op->getAttrOfType<mlir::SymbolRefAttr>("kernelFunc").getRootReference().str();
+          llvm::errs() << "use provided kernel func name = " << kernelFnName << "\n";
         } else {
           kernelFnName =
               Twine(op->getParentOfType<SymbolOpInterface>().getName(), "_kernel")
                   .str();
-          llvm::errs() << "original module name = " << kernelFnName << "\n";
+          llvm::errs() << "use default kernel func name = " << kernelFnName << "\n";
         }
 
         gpu::GPUFuncOp outlinedFunc =
@@ -381,7 +381,7 @@ class GpuKernelOutliningPass
         // Create nested module and insert outlinedFunc. The module will
         // originally get the same name as the function, but may be renamed on
         // insertion into the parent module.
-        auto kernelModule = createKernelModule(outlinedFunc, symbolTable);
+        auto kernelModule = createKernelModule(op, outlinedFunc, symbolTable);
         symbolTable.insert(kernelModule, insertPt);
 
         // Potentially changes signature, pulling in constants.
@@ -402,7 +402,7 @@ class GpuKernelOutliningPass
 
 private:
   /// Returns a gpu.module containing kernelFunc and all callees (recursive).
-  gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
+  gpu::GPUModuleOp createKernelModule(gpu::LaunchOp gpuLaunchOp, gpu::GPUFuncOp kernelFunc,
                                       const SymbolTable &parentSymbolTable) {
     // TODO: This code cannot use an OpBuilder because it must be inserted into
     // a SymbolTable by the caller. SymbolTable needs to be refactored to
@@ -410,8 +410,26 @@ class GpuKernelOutliningPass
     // and then this needs to use the OpBuilder.
     auto *context = getOperation().getContext();
     OpBuilder builder(context);
-    auto kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
-                                                         kernelFunc.getName());
+    std::string kernelModuleName;
+    if (gpuLaunchOp.hasKernelModuleName()) {
+      kernelModuleName = gpuLaunchOp->getAttrOfType<mlir::SymbolRefAttr>("kernelModule").getRootReference().str();
+      llvm::errs() << "use provided kernel module name = " << kernelModuleName << "\n";
+    } else {
+      kernelModuleName = kernelFunc.getName();
+      llvm::errs() << "use default kernel module name = " << kernelModuleName << "\n";
+    }
+
+    gpu::GPUModuleOp kernelModule;
+    // Check if the module already exists in the symbol table
+    if (auto existingModule = parentSymbolTable.lookup<gpu::GPUModuleOp>(kernelModuleName)) {
+      llvm::errs() << "Reusing existing kernel module: " << kernelModuleName << "\n";
+      kernelModule = existingModule;
+    } else {
+      // If not found, create a new GPU module
+      llvm::errs() << "Creating new kernel module: " << kernelModuleName << "\n";
+      kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
+                                                           kernelModuleName);
+    }
 
     // If a valid data layout spec was provided, attach it to the kernel module.
     // Otherwise, the default data layout will be used.
@@ -439,6 +457,8 @@ class GpuKernelOutliningPass
       }
     }
 
+    //llvm::errs() << "kernelModule:\n" << kernelModule << "\n";
+
     return kernelModule;
   }
 

>From c78e8360c93747c4a8639fcf5a8d37219d96ea9c Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Thu, 5 Dec 2024 11:27:57 -0800
Subject: [PATCH 3/9] formatting

---
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    |  3 +-
 .../GPU/Transforms/KernelOutlining.cpp        | 28 +++++++++----------
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 94d3872a45e2f2..71d14f5f7774b9 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -839,7 +839,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     -   a variadic number of Workgroup memory attributions.
     -   a variadic number of Private memory attributions.
 
-    The `kernelFunc` and `kernelModule` attributes are optional and specifies the kernel name and a module in whichthe kernel should be outlined. 
+    The `kernelFunc` and `kernelModule` attributes are optional and specifies
+    the kernel name and a module in whichthe kernel should be outlined. 
 
 
     Syntax:
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index 6028cb58d6842c..872200566bb315 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -366,13 +366,14 @@ class GpuKernelOutliningPass
         SetVector<Value> operands;
         std::string kernelFnName;
         if (op.hasKernelFuncName()) {
-          kernelFnName = op->getAttrOfType<mlir::SymbolRefAttr>("kernelFunc").getRootReference().str();
-          llvm::errs() << "use provided kernel func name = " << kernelFnName << "\n";
+          kernelFnName = op->getAttrOfType<mlir::SymbolRefAttr>("kernelFunc")
+                             .getRootReference()
+                             .str();
         } else {
           kernelFnName =
-              Twine(op->getParentOfType<SymbolOpInterface>().getName(), "_kernel")
+              Twine(op->getParentOfType<SymbolOpInterface>().getName(),
+                    "_kernel")
                   .str();
-          llvm::errs() << "use default kernel func name = " << kernelFnName << "\n";
         }
 
         gpu::GPUFuncOp outlinedFunc =
@@ -402,7 +403,8 @@ class GpuKernelOutliningPass
 
 private:
   /// Returns a gpu.module containing kernelFunc and all callees (recursive).
-  gpu::GPUModuleOp createKernelModule(gpu::LaunchOp gpuLaunchOp, gpu::GPUFuncOp kernelFunc,
+  gpu::GPUModuleOp createKernelModule(gpu::LaunchOp gpuLaunchOp,
+                                      gpu::GPUFuncOp kernelFunc,
                                       const SymbolTable &parentSymbolTable) {
     // TODO: This code cannot use an OpBuilder because it must be inserted into
     // a SymbolTable by the caller. SymbolTable needs to be refactored to
@@ -412,23 +414,23 @@ class GpuKernelOutliningPass
     OpBuilder builder(context);
     std::string kernelModuleName;
     if (gpuLaunchOp.hasKernelModuleName()) {
-      kernelModuleName = gpuLaunchOp->getAttrOfType<mlir::SymbolRefAttr>("kernelModule").getRootReference().str();
-      llvm::errs() << "use provided kernel module name = " << kernelModuleName << "\n";
+      kernelModuleName =
+          gpuLaunchOp->getAttrOfType<mlir::SymbolRefAttr>("kernelModule")
+              .getRootReference()
+              .str();
     } else {
       kernelModuleName = kernelFunc.getName();
-      llvm::errs() << "use default kernel module name = " << kernelModuleName << "\n";
     }
 
     gpu::GPUModuleOp kernelModule;
     // Check if the module already exists in the symbol table
-    if (auto existingModule = parentSymbolTable.lookup<gpu::GPUModuleOp>(kernelModuleName)) {
-      llvm::errs() << "Reusing existing kernel module: " << kernelModuleName << "\n";
+    if (auto existingModule =
+            parentSymbolTable.lookup<gpu::GPUModuleOp>(kernelModuleName)) {
       kernelModule = existingModule;
     } else {
       // If not found, create a new GPU module
-      llvm::errs() << "Creating new kernel module: " << kernelModuleName << "\n";
       kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
-                                                           kernelModuleName);
+                                                      kernelModuleName);
     }
 
     // If a valid data layout spec was provided, attach it to the kernel module.
@@ -457,8 +459,6 @@ class GpuKernelOutliningPass
       }
     }
 
-    //llvm::errs() << "kernelModule:\n" << kernelModule << "\n";
-
     return kernelModule;
   }
 

>From 0d422ef5922852be734ecb86fb30d648fc2f152b Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Thu, 5 Dec 2024 14:45:59 -0800
Subject: [PATCH 4/9] address review feedbacks

---
 mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index 872200566bb315..da011be1c7eb79 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -365,10 +365,8 @@ class GpuKernelOutliningPass
       auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
         SetVector<Value> operands;
         std::string kernelFnName;
-        if (op.hasKernelFuncName()) {
-          kernelFnName = op->getAttrOfType<mlir::SymbolRefAttr>("kernelFunc")
-                             .getRootReference()
-                             .str();
+        if (op.getKernelFunc()) {
+          kernelFnName = op.getKernelFunc()->getRootReference().str();
         } else {
           kernelFnName =
               Twine(op->getParentOfType<SymbolOpInterface>().getName(),
@@ -413,11 +411,9 @@ class GpuKernelOutliningPass
     auto *context = getOperation().getContext();
     OpBuilder builder(context);
     std::string kernelModuleName;
-    if (gpuLaunchOp.hasKernelModuleName()) {
+    if (gpuLaunchOp.getKernelModule()) {
       kernelModuleName =
-          gpuLaunchOp->getAttrOfType<mlir::SymbolRefAttr>("kernelModule")
-              .getRootReference()
-              .str();
+          gpuLaunchOp.getKernelModule()->getRootReference().str();
     } else {
       kernelModuleName = kernelFunc.getName();
     }

>From d31b13935d3512449ad906e2110ac208a0fa1a3e Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Thu, 5 Dec 2024 14:46:25 -0800
Subject: [PATCH 5/9] address review feedbacks

---
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 71d14f5f7774b9..e700e478f2c089 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -842,7 +842,6 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     The `kernelFunc` and `kernelModule` attributes are optional and specifies
     the kernel name and a module in whichthe kernel should be outlined. 
 
-
     Syntax:
 
     ```
@@ -1036,16 +1035,6 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     static StringRef getNumWorkgroupAttributionsAttrName() {
       return "workgroup_attributions";
     }
-
-    /// Checks if the kernel func name attribute is present.
-    bool hasKernelFuncName() {
-      return getKernelFunc().has_value();
-    }
-
-    /// Checks if the kernel module name attribute is present.
-    bool hasKernelModuleName() {
-      return getKernelModule().has_value();
-    }
   }];
 
   let hasCanonicalizer = 1;

>From 8b5827f401f0dbde4dd79f3384af01593be59239 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Thu, 5 Dec 2024 20:13:32 -0800
Subject: [PATCH 6/9] Move the check of existing kernel module inside the case
 where kernel module name is specified.

---
 mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index da011be1c7eb79..ff8e214e5c10fe 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -411,19 +411,20 @@ class GpuKernelOutliningPass
     auto *context = getOperation().getContext();
     OpBuilder builder(context);
     std::string kernelModuleName;
+    gpu::GPUModuleOp kernelModule = nullptr;
     if (gpuLaunchOp.getKernelModule()) {
       kernelModuleName =
           gpuLaunchOp.getKernelModule()->getRootReference().str();
+      if (auto existingModule =
+              parentSymbolTable.lookup<gpu::GPUModuleOp>(kernelModuleName)) {
+        kernelModule = existingModule;
+      }
     } else {
       kernelModuleName = kernelFunc.getName();
     }
 
-    gpu::GPUModuleOp kernelModule;
     // Check if the module already exists in the symbol table
-    if (auto existingModule =
-            parentSymbolTable.lookup<gpu::GPUModuleOp>(kernelModuleName)) {
-      kernelModule = existingModule;
-    } else {
+    if (!kernelModule) {
       // If not found, create a new GPU module
       kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
                                                       kernelModuleName);

>From 9110630ab645bea0d7a3d5abbe548ec54188bf94 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Thu, 5 Dec 2024 20:59:10 -0800
Subject: [PATCH 7/9] add test

---
 mlir/test/Dialect/GPU/outlining.mlir | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 6e682b26f6c95c..566547123ba698 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -508,3 +508,24 @@ func.func @launch_cluster() {
 // CHECK-NEXT: "some_op"(%[[CID]], %[[BID]], %[[BDIM]]) : (index, index, index) -> ()
 // CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
 
+// -----
+// Verifies that gpu.launch honors the optional kernelModule and kernelFunc attributes when both are specified.
+// CHECK-LABEL: func.func @testKernelAttributes()
+// CHECK: gpu.launch_func  @test_module::@test_kernel_func blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
+// CHECK: gpu.module @test_module
+// CHECK: gpu.func @test_kernel_func()
+func.func @testKernelAttributes() {
+  %gDimX = arith.constant 8 : index
+  %gDimY = arith.constant 12 : index
+  %gDimZ = arith.constant 16 : index
+  %bDimX = arith.constant 32 : index
+  %bDimY = arith.constant 16 : index
+  %bDimZ = arith.constant 8 : index
+
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
+             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
+    "some_op"(%bx, %tx) : (index, index) -> ()
+    gpu.terminator
+  } {kernelModule = @test_module, kernelFunc = @test_kernel_func}
+  return
+}

>From 4ff90639a5bd19094f06e22d98d608d3ef86c8ee Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Thu, 5 Dec 2024 22:17:21 -0800
Subject: [PATCH 8/9] More changes based on review feedback. More tests.

---
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    |   2 +-
 .../GPU/Transforms/KernelOutlining.cpp        |   8 +-
 mlir/test/Dialect/GPU/outlining.mlir          | 102 ++++++++++++++++++
 3 files changed, 106 insertions(+), 6 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index e700e478f2c089..42a017db300af6 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -840,7 +840,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     -   a variadic number of Private memory attributions.
 
     The `kernelFunc` and `kernelModule` attributes are optional and specifies
-    the kernel name and a module in whichthe kernel should be outlined. 
+    the kernel name and a module in which the kernel should be outlined. 
 
     Syntax:
 
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index ff8e214e5c10fe..ba0c80c50211e3 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -411,14 +411,12 @@ class GpuKernelOutliningPass
     auto *context = getOperation().getContext();
     OpBuilder builder(context);
     std::string kernelModuleName;
-    gpu::GPUModuleOp kernelModule = nullptr;
+    gpu::GPUModuleOp kernelModule;
     if (gpuLaunchOp.getKernelModule()) {
       kernelModuleName =
           gpuLaunchOp.getKernelModule()->getRootReference().str();
-      if (auto existingModule =
-              parentSymbolTable.lookup<gpu::GPUModuleOp>(kernelModuleName)) {
-        kernelModule = existingModule;
-      }
+      kernelModule =
+          parentSymbolTable.lookup<gpu::GPUModuleOp>(kernelModuleName);
     } else {
       kernelModuleName = kernelFunc.getName();
     }
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 566547123ba698..f1071814b8eda7 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -529,3 +529,105 @@ func.func @testKernelAttributes() {
   } {kernelModule = @test_module, kernelFunc = @test_kernel_func}
   return
 }
+
+// -----
+// Verifies the optional kernelModule and kernelFunc attributes of gpu.launch when the named kernelModule already exists: the kernel is outlined into that module and renamed to avoid a symbol clash.
+
+// CHECK-LABEL: gpu.module @existing_module
+// CHECK: gpu.func @test_kernel_func()
+// CHECK: gpu.func @test_kernel_func_0()
+// CHECK-NOT: gpu.module @testExistingModule_kernel
+// CHECK-NOT: gpu.func @testExistingModule_kernel()
+// CHECK: func.func @testExistingModule()
+// CHECK: gpu.launch_func  @existing_module::@test_kernel_func_0 blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
+
+gpu.module @existing_module {
+  gpu.func @test_kernel_func() {
+    gpu.return
+  }
+}
+
+func.func @testExistingModule() {
+  %gDimX = arith.constant 8 : index
+  %gDimY = arith.constant 12 : index
+  %gDimZ = arith.constant 16 : index
+  %bDimX = arith.constant 32 : index
+  %bDimY = arith.constant 16 : index
+  %bDimZ = arith.constant 8 : index
+
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
+             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
+    "some_op"(%bx, %tx) : (index, index) -> ()
+    gpu.terminator
+  } {kernelModule = @existing_module, kernelFunc = @test_kernel_func}
+  return
+}
+
+// -----
+// Verifies gpu.launch when only the optional kernelModule attribute is specified; the kernel keeps its default name.
+// CHECK-LABEL: func.func @testKernelModuleOnly()
+// CHECK: gpu.launch_func  @test_module::@testKernelModuleOnly_kernel blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
+// CHECK: gpu.module @test_module
+// CHECK: gpu.func @testKernelModuleOnly_kernel()
+func.func @testKernelModuleOnly() {
+  %gDimX = arith.constant 8 : index
+  %gDimY = arith.constant 12 : index
+  %gDimZ = arith.constant 16 : index
+  %bDimX = arith.constant 32 : index
+  %bDimY = arith.constant 16 : index
+  %bDimZ = arith.constant 8 : index
+
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
+             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
+    "some_op"(%bx, %tx) : (index, index) -> ()
+    gpu.terminator
+  } {kernelModule = @test_module}
+  return
+}
+
+// -----
+// Verifies gpu.launch when only the optional kernelFunc attribute is specified; the module takes the kernel function's name.
+// CHECK-LABEL: func.func @testKernelFuncOnly()
+// CHECK: gpu.launch_func  @test_kernel_func::@test_kernel_func blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
+
+// CHECK: gpu.module @test_kernel_func
+// CHECK: gpu.func @test_kernel_func()
+func.func @testKernelFuncOnly() {
+  %gDimX = arith.constant 8 : index
+  %gDimY = arith.constant 12 : index
+  %gDimZ = arith.constant 16 : index
+  %bDimX = arith.constant 32 : index
+  %bDimY = arith.constant 16 : index
+  %bDimZ = arith.constant 8 : index
+
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
+             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
+    "some_op"(%bx, %tx) : (index, index) -> ()
+    gpu.terminator
+  } {kernelFunc = @test_kernel_func}
+  return
+}
+
+
+// -----
+// This test tests gpu.launch when optional attributes kernelModule and kernelFunc are not specified.
+// CHECK-LABEL: func.func @testNoAttributes()
+// CHECK: gpu.launch_func  @testNoAttributes_kernel::@testNoAttributes_kernel blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
+
+// CHECK: gpu.module @testNoAttributes_kernel
+// CHECK: gpu.func @testNoAttributes_kernel()
+func.func @testNoAttributes() {
+  %gDimX = arith.constant 8 : index
+  %gDimY = arith.constant 12 : index
+  %gDimZ = arith.constant 16 : index
+  %bDimX = arith.constant 32 : index
+  %bDimY = arith.constant 16 : index
+  %bDimZ = arith.constant 8 : index
+
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
+             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
+    "some_op"(%bx, %tx) : (index, index) -> ()
+    gpu.terminator
+  }
+  return
+}

>From 26b19ff0cdb01fc414c2d81ac9c7142bd9da3b34 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Thu, 5 Dec 2024 22:19:41 -0800
Subject: [PATCH 9/9] deleting an extra empty line

---
 mlir/test/Dialect/GPU/outlining.mlir | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index f1071814b8eda7..d48fa054432d1a 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -608,7 +608,6 @@ func.func @testKernelFuncOnly() {
   return
 }
 
-
 // -----
 // This test tests gpu.launch when optional attributes kernelModule and kernelFunc are not specified.
 // CHECK-LABEL: func.func @testNoAttributes()