[Mlir-commits] [mlir] [mlir][gpu] Allow integer attribute as `dynamic_shared_memory_size` p… (PR #71509)

Tue Nov 7 02:34:28 PST 2023

https://github.com/grypp created https://github.com/llvm/llvm-project/pull/71509

…arameter of `gpu.launch`

This PR allows an integer attribute to be used as the `dynamic_shared_memory_size` parameter of `gpu.launch`. See the example IR below: `200` no longer has to be an SSA value.
```
gpu.launch blocks(..) threads(...)
             dynamic_shared_memory_size 200
```

>From 275969718d63de31e403ee29714971c6ea671357 Mon Sep 17 00:00:00 2001
From: Guray Ozen <guray.ozen at gmail.com>
Date: Tue, 7 Nov 2023 11:32:08 +0100
Subject: [PATCH] [mlir][gpu] Allow integer attribute as
 `dynamic_shared_memory_size` parameter of `gpu.launch`

This PR allows an integer attribute to be used as the `dynamic_shared_memory_size` parameter of `gpu.launch`. See the example IR below: `200` no longer has to be an SSA value.
```
gpu.launch blocks(..) threads(...)
             dynamic_shared_memory_size 200
```
---
 mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h |  1 +
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    | 24 +++++++++++--
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp        | 34 ++++++++++++++-----
 .../GPU/Transforms/KernelOutlining.cpp        |  2 +-
 mlir/test/Dialect/GPU/outlining.mlir          | 33 ++++++++++++++++++
 5 files changed, 82 insertions(+), 12 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
index 14a1fac5fd255f3..06b1ea95d20339d 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
@@ -17,6 +17,7 @@
 #include "mlir/Bytecode/BytecodeOpInterface.h"
 #include "mlir/Dialect/DLTI/Traits.h"
 #include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 6375d35f4311295..5bf5cbc5efe628f 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -587,7 +587,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
                Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
-               Optional<I32>:$dynamicSharedMemorySize)>,
+               Optional<I32>:$dynamicSharedMemorySize,
+               OptionalAttr<SI32Attr>:$dynamicSharedMemorySizeConstant)>,
     Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
   let summary = "GPU kernel launch operation";
 
@@ -693,7 +694,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [
       CArg<"Type", "nullptr">:$asyncTokenType,
       CArg<"ValueRange", "{}">:$asyncDependencies,
       CArg<"TypeRange", "{}">:$workgroupAttributions,
-      CArg<"TypeRange", "{}">:$privateAttributions)>
+      CArg<"TypeRange", "{}">:$privateAttributions,
+      CArg<"IntegerAttr", "IntegerAttr()">:$dynamicSharedMemorySizeConstant)>
   ];
 
   let extraClassDeclaration = [{
@@ -728,6 +730,24 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     /// Returns the keywords used in the custom syntax for this Op.
     static StringRef getWorkgroupKeyword() { return "workgroup"; }
     static StringRef getPrivateKeyword() { return "private"; }
+    static StringRef getDynamicSharedMemorySizeConstantKeyword() { 
+      return "dynamicSharedMemorySizeConstant"; 
+    }
+
+    static int getDynamicSharedMemorySizeDynamicValue() { 
+      return std::numeric_limits<int32_t>::min(); 
+    }
+    /// Returns the dynamic shared memory size: the SSA operand if the constant
+    /// attribute is absent or the sentinel, else a new i32 arith.constant.
+    mlir::Value getDynamicSharedMemorySizeValue(OpBuilder &b) { 
+      int32_t kDynamic = getDynamicSharedMemorySizeDynamicValue();
+      if (getDynamicSharedMemorySizeConstant().value_or(kDynamic) == kDynamic)
+        return getDynamicSharedMemorySize();
+      return b.create<mlir::arith::ConstantOp>(
+            getLoc(), b.getIntegerType(32),
+            b.getI32IntegerAttr(
+                getDynamicSharedMemorySizeConstant().value()));
+    }
 
     /// Returns the number of buffers located in the workgroup memory.
     unsigned getNumWorkgroupAttributions() {
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 5eb2cadc884e151..269ee7dcaec0e71 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -618,7 +618,8 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
                      Value getBlockSizeZ, Value dynamicSharedMemorySize,
                      Type asyncTokenType, ValueRange asyncDependencies,
                      TypeRange workgroupAttributions,
-                     TypeRange privateAttributions) {
+                     TypeRange privateAttributions,
+                     IntegerAttr dynamicSharedMemorySizeAttr) {
   // Add a WorkGroup attribution attribute. This attribute is required to
   // identify private attributions in the list of block argguments.
   result.addAttribute(getNumWorkgroupAttributionsAttrName(),
@@ -634,7 +635,9 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
                       getBlockSizeY, getBlockSizeZ});
   if (dynamicSharedMemorySize)
     result.addOperands(dynamicSharedMemorySize);
-
+  if (dynamicSharedMemorySizeAttr)
+    result.addAttribute(getDynamicSharedMemorySizeConstantKeyword(),
+                        dynamicSharedMemorySizeAttr);
   // Create a kernel body region with kNumConfigRegionAttributes + N memory
   // attributions, where the first kNumConfigRegionAttributes arguments have
   // `index` type and the rest have the same types as the data operands.
@@ -759,6 +762,10 @@ void LaunchOp::print(OpAsmPrinter &p) {
   if (getDynamicSharedMemorySize())
     p << ' ' << getDynamicSharedMemorySizeKeyword() << ' '
       << getDynamicSharedMemorySize();
+  else if (getDynamicSharedMemorySizeConstantAttr()) {
+    p << ' ' << getDynamicSharedMemorySizeKeyword() << ' '
+      << getDynamicSharedMemorySizeConstantAttr().getSInt();
+  }
 
   printAttributions(p, getWorkgroupKeyword(), getWorkgroupAttributions());
   printAttributions(p, getPrivateKeyword(), getPrivateAttributions());
@@ -768,7 +775,8 @@ void LaunchOp::print(OpAsmPrinter &p) {
   p.printRegion(getBody(), /*printEntryBlockArgs=*/false);
   p.printOptionalAttrDict((*this)->getAttrs(), /*elidedAttrs=*/{
                               LaunchOp::getOperandSegmentSizeAttr(),
-                              getNumWorkgroupAttributionsAttrName()});
+                              getNumWorkgroupAttributionsAttrName(),
+                              getDynamicSharedMemorySizeConstantKeyword()});
 }
 
 // Parse the size assignment blocks for blocks and threads.  These have the form
@@ -854,12 +862,20 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
   bool hasDynamicSharedMemorySize = false;
   if (!parser.parseOptionalKeyword(
           LaunchOp::getDynamicSharedMemorySizeKeyword())) {
-    hasDynamicSharedMemorySize = true;
-    if (parser.parseOperand(dynamicSharedMemorySize) ||
-        parser.resolveOperand(dynamicSharedMemorySize,
-                              parser.getBuilder().getI32Type(),
-                              result.operands))
-      return failure();
+    IntegerAttr shmemAttr;
+    OptionalParseResult shmemAttrResult = parser.parseOptionalAttribute(
+        shmemAttr, parser.getBuilder().getIntegerType(32, true));
+    if (!shmemAttrResult.has_value()) {
+      hasDynamicSharedMemorySize = true;
+      shmemAttr = parser.getBuilder().getSI32IntegerAttr(
+          getDynamicSharedMemorySizeDynamicValue());
+      if (parser.parseOperand(dynamicSharedMemorySize) ||
+          parser.resolveOperand(dynamicSharedMemorySize,
+                                parser.getBuilder().getI32Type(),
+                                result.operands))
+        return failure();
+    }
+    result.addAttribute(getDynamicSharedMemorySizeConstantKeyword(), shmemAttr);
   }
 
   // Create the region arguments, it has kNumConfigRegionAttributes arguments
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index b1e2f914db4cb9b..3e29fbe8cdfbbc3 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -281,7 +281,7 @@ static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
   auto launchFunc = builder.create<gpu::LaunchFuncOp>(
       launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
       launchOp.getBlockSizeOperandValues(),
-      launchOp.getDynamicSharedMemorySize(), operands,
+      launchOp.getDynamicSharedMemorySizeValue(builder), operands,
       asyncToken ? asyncToken.getType() : nullptr,
       launchOp.getAsyncDependencies());
   launchOp.replaceAllUsesWith(launchFunc);
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 28c121a550100c2..b032a4035230990 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -372,3 +372,36 @@ func.func @launch_memory_attributions_1(%arg0 : memref<*xf32>) {
 }
 
 // CHECK-DL-LABEL: gpu.module @launch_memory_attributions_1_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
+
+// -----
+
+// CHECK-LABEL: func.func @dynamic_shared_memory(
+// CHECK-SAME: %[[arg0:.+]]: i32
+func.func @dynamic_shared_memory(%shmemSize : i32) {  
+  %c1 = arith.constant 1 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sbx = %c1, %sby = %c1, %sbz = %c1)
+             threads(%tx, %ty, %tz) in (%stx = %c1, %sty = %c1, %stz = %c1) 
+             dynamic_shared_memory_size %shmemSize
+  {
+    gpu.terminator
+  }
+  gpu.launch blocks(%bx, %by, %bz) in (%sbx = %c1, %sby = %c1, %sbz = %c1)
+             threads(%tx, %ty, %tz) in (%stx = %c1, %sty = %c1, %stz = %c1) 
+             dynamic_shared_memory_size 200
+  {
+    gpu.terminator
+  }
+    gpu.launch blocks(%bx, %by, %bz) in (%sbx = %c1, %sby = %c1, %sbz = %c1)
+             threads(%tx, %ty, %tz) in (%stx = %c1, %sty = %c1, %stz = %c1)              
+  {
+    gpu.terminator
+  }
+
+
+// CHECK: gpu.launch_func  @dynamic_shared_memory_kernel::@dynamic_shared_memory_kernel blocks in (%{{.+}}, %{{.+}}, %{{.+}}) threads in (%{{.+}}, %{{.+}}, %{{.+}})  dynamic_shared_memory_size %[[arg0]]
+// CHECK: %[[c200:.+]] = arith.constant 200 : i32
+// CHECK: gpu.launch_func  @dynamic_shared_memory_kernel_0::@dynamic_shared_memory_kernel blocks in (%{{.+}}, %{{.+}}, %{{.+}}) threads in (%{{.+}}, %{{.+}}, %{{.+}})  dynamic_shared_memory_size %[[c200]]
+  return
+}
+