[Mlir-commits] [mlir] [mlir][AMDGPU] Set uniform-work-group-size=true by default (PR #79077)

Mon Feb 19 09:34:34 PST 2024

https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/79077

>From 1eedc8a0d6c929f4ec174360754752cc23f374b6 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Mon, 22 Jan 2024 23:39:41 +0000
Subject: [PATCH 1/2] [mlir][AMDGPU] Set uniform-work-group-size=true by
 default

GPU kernels generated via typical MLIR mechanisms make the assumption
that all workgroups are of uniform size, and so, as in OpenMP, it is
appropriate to set the "uniform-work-group-size"="true" attribute on
these functions by default. This commit makes that choice.

In the event it is needed, this commit adds
`rocdl.uniform_work_group_size` as an attribute to be set on LLVM
functions that can be used to override the default.
---
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td  |  6 ++++++
 .../ROCDL/ROCDLToLLVMIRTranslation.cpp        | 20 ++++++++++++++++++-
 mlir/test/Target/LLVMIR/rocdl.mlir            |  9 ++++++++-
 3 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 516a984399ff81..b8d9a692a4d6ed 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -37,6 +37,12 @@ def ROCDL_Dialect : Dialect {
     static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() {
       return ::llvm::StringLiteral("rocdl.reqd_work_group_size");
     }
+    /// MLIR's gpu-related infrastructure effectively assumes uniform workgroup
+    /// sizes, so this attribute defaults to "true" on `rocdl.kernel` functions.
+    /// It is provided here to allow overriding this assumption.
+    static constexpr ::llvm::StringLiteral getUniformWorkGroupSizeAttrName() {
+      return ::llvm::StringLiteral("rocdl.uniform_work_group_size");
+    }
 
     /// The address space value that represents global memory.
     static constexpr unsigned kGlobalMemoryAddressSpace = 1;
diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
index 0cbb3da79d151e..7f154f3b07c568 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
@@ -100,6 +100,12 @@ class ROCDLDialectLLVMIRTranslationInterface
         llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256");
       }
 
+      // MLIR's GPU kernel APIs all assume and produce uniformly-sized
+      // workgroups, so the lowering of the `rocdl.kernel` marker encodes this
+      // assumption. This assumption may be overridden by setting
+      // `rocdl.uniform_work_group_size` on a given function.
+      if (!llvmFunc->hasFnAttribute("uniform-work-group-size"))
+        llvmFunc->addFnAttr("uniform-work-group-size", "true");
     }
     // Override flat-work-group-size
     // TODO: update clients to rocdl.flat_work_group_size instead,
@@ -134,7 +140,19 @@ class ROCDLDialectLLVMIRTranslationInterface
       llvmAttrValue.append(value.getValue());
       llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
     }
-
+    if (ROCDL::ROCDLDialect::getUniformWorkGroupSizeAttrName() ==
+        attribute.getName()) {
+      auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
+      if (!func)
+        return failure();
+      auto value = dyn_cast<BoolAttr>(attribute.getValue());
+      if (!value)
+        return failure();
+      llvm::Function *llvmFunc =
+          moduleTranslation.lookupFunction(func.getName());
+      llvmFunc->addFnAttr("uniform-work-group-size",
+                          value.getValue() ? "true" : "false");
+    }
     // Set reqd_work_group_size metadata
     if (ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName() ==
         attribute.getName()) {
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 3c9c70711ae230..0041b959a5a1d2 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -56,6 +56,12 @@ llvm.func @known_block_sizes()
   llvm.return
 }
 
+llvm.func @kernel_func_no_uniform_work_groups() attributes {rocdl.kernel, rocdl.uniform_work_group_size = false} {
+  // CHECK-LABEL: amdgpu_kernel void @kernel_func_no_uniform_work_groups()
+  // CHECK: #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS:[0-9]+]]
+  llvm.return
+}
+
 llvm.func @rocdl.lane_id() -> i32 {
   // CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   // CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])
@@ -489,8 +495,9 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
   llvm.return %source5 : i32
 }
 
-// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" }
+// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" }
 // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
+// CHECK-DAG: attributes #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="false" }
 // CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64}
 // CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2}

>From 4efa86bca712da519058162fbd59700febaa5a25 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Mon, 19 Feb 2024 17:34:21 +0000
Subject: [PATCH 2/2] Add proper failure messages to translation

---
 .../ROCDL/ROCDLToLLVMIRTranslation.cpp        | 28 +++++++++++++------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
index 7f154f3b07c568..e50ca1959d0c88 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
@@ -87,7 +87,9 @@ class ROCDLDialectLLVMIRTranslationInterface
     if (attribute.getName() == ROCDL::ROCDLDialect::getKernelFuncAttrName()) {
       auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
       if (!func)
-        return failure();
+        return op->emitOpError(Twine(attribute.getName()) +
+                               " is only supported on `llvm.func` operations");
+      ;
 
       // For GPU kernels,
       // 1. Insert AMDGPU_KERNEL calling convention.
@@ -113,10 +115,12 @@ class ROCDLDialectLLVMIRTranslationInterface
     if ("rocdl.max_flat_work_group_size" == attribute.getName()) {
       auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
       if (!func)
-        return failure();
+        return op->emitOpError(Twine(attribute.getName()) +
+                               " is only supported on `llvm.func` operations");
       auto value = dyn_cast<IntegerAttr>(attribute.getValue());
       if (!value)
-        return failure();
+        return op->emitOpError(Twine(attribute.getName()) +
+                               " must be an integer");
 
       llvm::Function *llvmFunc =
           moduleTranslation.lookupFunction(func.getName());
@@ -129,10 +133,12 @@ class ROCDLDialectLLVMIRTranslationInterface
         attribute.getName()) {
       auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
       if (!func)
-        return failure();
+        return op->emitOpError(Twine(attribute.getName()) +
+                               " is only supported on `llvm.func` operations");
       auto value = dyn_cast<StringAttr>(attribute.getValue());
       if (!value)
-        return failure();
+        return op->emitOpError(Twine(attribute.getName()) +
+                               " must be a string");
 
       llvm::Function *llvmFunc =
           moduleTranslation.lookupFunction(func.getName());
@@ -144,10 +150,12 @@ class ROCDLDialectLLVMIRTranslationInterface
         attribute.getName()) {
       auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
       if (!func)
-        return failure();
+        return op->emitOpError(Twine(attribute.getName()) +
+                               " is only supported on `llvm.func` operations");
       auto value = dyn_cast<BoolAttr>(attribute.getValue());
       if (!value)
-        return failure();
+        return op->emitOpError(Twine(attribute.getName()) +
+                               " must be a boolean");
       llvm::Function *llvmFunc =
           moduleTranslation.lookupFunction(func.getName());
       llvmFunc->addFnAttr("uniform-work-group-size",
@@ -158,10 +166,12 @@ class ROCDLDialectLLVMIRTranslationInterface
         attribute.getName()) {
       auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
       if (!func)
-        return failure();
+        return op->emitOpError(Twine(attribute.getName()) +
+                               " is only supported on `llvm.func` operations");
       auto value = dyn_cast<DenseI32ArrayAttr>(attribute.getValue());
       if (!value)
-        return failure();
+        return op->emitOpError(Twine(attribute.getName()) +
+                               " must be a dense i32 array attribute");
       llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
       SmallVector<llvm::Metadata *, 3> metadata;
       llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32);