[Mlir-commits] [mlir] [mlir][AMDGPU] Set uniform-work-group-size=true by default (PR #79077)

Tue Jan 23 12:43:42 PST 2024

https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/79077

>From 1eedc8a0d6c929f4ec174360754752cc23f374b6 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Mon, 22 Jan 2024 23:39:41 +0000
Subject: [PATCH] [mlir][AMDGPU] Set uniform-work-group-size=true by default

GPU kernels generated via typical MLIR mechanisms make the assumption
that all workgroups are of uniform size, and so, as in OpenMP, it is
appropriate to set the "uniform-work-group-size"="true" attribute on
these functions by default. This commit makes that choice.

In the event it is needed, this commit adds
`rocdl.uniform_work_group_size` as an attribute to be set on LLVM
functions that can be used to override the default.
---
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td  |  6 ++++++
 .../ROCDL/ROCDLToLLVMIRTranslation.cpp        | 20 ++++++++++++++++++-
 mlir/test/Target/LLVMIR/rocdl.mlir            |  9 ++++++++-
 3 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 516a984399ff815..b8d9a692a4d6ed7 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -37,6 +37,12 @@ def ROCDL_Dialect : Dialect {
     static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() {
       return ::llvm::StringLiteral("rocdl.reqd_work_group_size");
     }
+    /// MLIR's gpu-related infrastructure effectively assumes uniform workgroup
+    /// sizes, so this attribute defaults to "true" on `rocdl.kernel` functions.
+    /// It is provided here to allow overriding this assumption.
+    static constexpr ::llvm::StringLiteral getUniformWorkGroupSizeAttrName() {
+      return ::llvm::StringLiteral("rocdl.uniform_work_group_size");
+    }
 
     /// The address space value that represents global memory.
     static constexpr unsigned kGlobalMemoryAddressSpace = 1;
diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
index 0cbb3da79d151e0..7f154f3b07c5683 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
@@ -100,6 +100,12 @@ class ROCDLDialectLLVMIRTranslationInterface
         llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256");
       }
 
+      // MLIR's GPU kernel APIs all assume and produce uniformly-sized
+      // workgroups, so the lowering of the `rocdl.kernel` marker encodes this
+      // assumption. This assumption may be overridden by setting
+      // `rocdl.uniform_work_group_size` on a given function.
+      if (!llvmFunc->hasFnAttribute("uniform-work-group-size"))
+        llvmFunc->addFnAttr("uniform-work-group-size", "true");
     }
     // Override flat-work-group-size
     // TODO: update clients to rocdl.flat_work_group_size instead,
@@ -134,7 +140,19 @@ class ROCDLDialectLLVMIRTranslationInterface
       llvmAttrValue.append(value.getValue());
       llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
     }
-
+    if (ROCDL::ROCDLDialect::getUniformWorkGroupSizeAttrName() ==
+        attribute.getName()) {
+      auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
+      if (!func)
+        return failure();
+      auto value = dyn_cast<BoolAttr>(attribute.getValue());
+      if (!value)
+        return failure();
+      llvm::Function *llvmFunc =
+          moduleTranslation.lookupFunction(func.getName());
+      llvmFunc->addFnAttr("uniform-work-group-size",
+                          value.getValue() ? "true" : "false");
+    }
     // Set reqd_work_group_size metadata
     if (ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName() ==
         attribute.getName()) {
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 3c9c70711ae2304..0041b959a5a1d26 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -56,6 +56,12 @@ llvm.func @known_block_sizes()
   llvm.return
 }
 
+llvm.func @kernel_func_no_uniform_work_groups() attributes {rocdl.kernel, rocdl.uniform_work_group_size = false} {
+  // CHECK-LABEL: amdgpu_kernel void @kernel_func_no_uniform_work_groups()
+  // CHECK: #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS:[0-9]+]]
+  llvm.return
+}
+
 llvm.func @rocdl.lane_id() -> i32 {
   // CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   // CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])
@@ -489,8 +495,9 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
   llvm.return %source5 : i32
 }
 
-// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" }
+// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" }
 // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
+// CHECK-DAG: attributes #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="false" }
 // CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64}
 // CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2}