[Mlir-commits] [mlir] GPU known subgroup size (PR #112732)

llvmlistbot at llvm.org
Thu Oct 17 08:37:24 PDT 2024


llvmbot wrote:



@llvm/pr-subscribers-mlir

Author: Finlay (FMarno)




---
Full diff: https://github.com/llvm/llvm-project/pull/112732.diff


4 Files Affected:

- (modified) mlir/include/mlir/Dialect/GPU/IR/GPUBase.td (+2-1) 
- (modified) mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (+8-1) 
- (modified) mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp (+39-8) 
- (modified) mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir (+4-6) 


``````````diff
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
index 860f8933672038..fb9df5067a31b0 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -64,7 +64,8 @@ def GPU_Dialect : Dialect {
 
   let discardableAttrs = (ins
     "::mlir::DenseI32ArrayAttr":$known_block_size,
-    "::mlir::DenseI32ArrayAttr":$known_grid_size
+    "::mlir::DenseI32ArrayAttr":$known_grid_size,
+    "::mlir::IntegerAttr" : $known_subgroup_size
   );
 
   let dependentDialects = ["arith::ArithDialect"];
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 6098eb34d04d52..d4779d1b47a42d 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -388,6 +388,12 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
     by using `gpu.known_block_size` or `gpu.known_grid_size`, but this carries
 the risk that they will be discarded.
 
+    A function may optionally be annotated, via the `known_subgroup_size`
+    attribute, with the subgroup size that will be used when it is launched.
+    If set, the attribute must be a single positive integer (i.e. > 0).
+    Launching a function carrying this annotation with a subgroup size other
+    than the one specified is undefined behaviour.
+
     Syntax:
 
     ```
@@ -431,7 +437,8 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
                        OptionalAttr<DictArrayAttr>:$workgroup_attrib_attrs,
                        OptionalAttr<DictArrayAttr>:$private_attrib_attrs,
                        GPU_OptionalDimSizeHintAttr:$known_block_size,
-                       GPU_OptionalDimSizeHintAttr:$known_grid_size);
+                       GPU_OptionalDimSizeHintAttr:$known_grid_size,
+                       OptionalAttr<I32Attr>:$known_subgroup_size);
   let regions = (region AnyRegion:$body);
 
   let skipDefaultBuilders = 1;
diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
index 544f1f4a4f6a79..2a5e5de2357641 100644
--- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
+++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
@@ -271,24 +271,55 @@ struct GPUShuffleConversion final : ConvertOpToLLVMPattern<gpu::ShuffleOp> {
                          typeMangling.value());
   }
 
+  static std::optional<uint32_t>
+  getIntelReqdSubGroupSize(FunctionOpInterface func) {
+    constexpr llvm::StringLiteral discardableIntelReqdSubgroupSize =
+        "llvm.intel_reqd_sub_group_size";
+    IntegerAttr reqdSubgroupSizeAttr = llvm::cast_if_present<IntegerAttr>(
+        func->getAttr(discardableIntelReqdSubgroupSize));
+    if (!reqdSubgroupSizeAttr)
+      return {};
+
+    return reqdSubgroupSizeAttr.getInt();
+  }
+
   /// Get the subgroup size from the target or return a default.
-  static int getSubgroupSize(Operation *op) {
-    return spirv::lookupTargetEnvOrDefault(op)
-        .getResourceLimits()
-        .getSubgroupSize();
+  static std::optional<uint32_t>
+  getKnownSubgroupSize(FunctionOpInterface func) {
+    IntegerAttr knownSubgroupSizeAttr =
+        mlir::gpu::GPUDialect::KnownSubgroupSizeAttrHelper(func->getContext())
+            .getAttr(func);
+    if (!knownSubgroupSizeAttr)
+      return {};
+
+    return knownSubgroupSizeAttr.getInt();
   }
 
-  static bool hasValidWidth(gpu::ShuffleOp op) {
+  static std::optional<uint32_t> getSubgroupSize(Operation *op) {
+    FunctionOpInterface func = op->getParentOfType<FunctionOpInterface>();
+    if (!func)
+      return {};
+    auto knownSubgroupSize = getKnownSubgroupSize(func);
+    if (knownSubgroupSize)
+      return knownSubgroupSize;
+    return getIntelReqdSubGroupSize(func);
+  }
+
+  static bool hasValidWidth(gpu::ShuffleOp op, uint32_t subgroupSize) {
     llvm::APInt val;
     Value width = op.getWidth();
-    return matchPattern(width, m_ConstantInt(&val)) &&
-           val == getSubgroupSize(op);
+    return matchPattern(width, m_ConstantInt(&val)) && val == subgroupSize;
   }
 
   LogicalResult
   matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const final {
-    if (!hasValidWidth(op))
+    auto maybeSubgroupSize = getSubgroupSize(op);
+    if (!maybeSubgroupSize)
+      return rewriter.notifyMatchFailure(
+          op, "subgroup size not specified. Should be specified with "
+              "known_subgroup_size.");
+    if (!hasValidWidth(op, maybeSubgroupSize.value()))
       return rewriter.notifyMatchFailure(
           op, "shuffle width and subgroup size mismatch");
 
diff --git a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
index 910105ddf69586..467d15e5c2ef2b 100644
--- a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
+++ b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
@@ -260,7 +260,7 @@ gpu.module @shuffles {
   func.func @gpu_shuffles(%val0: i32, %id: i32,
                           %val1: i64, %mask: i32,
                           %val2: f32, %delta_up: i32,
-                          %val3: f64, %delta_down: i32) {
+                          %val3: f64, %delta_down: i32) attributes { llvm.intel_reqd_sub_group_size = 32 : i32 } {
     %width = arith.constant 32 : i32
     // CHECK:         llvm.call spir_funccc @_Z17sub_group_shuffleij(%[[VAL_0]], %[[VAL_1]]) {
     // CHECK-SAME-DAG:  no_unwind
@@ -302,9 +302,7 @@ gpu.module @shuffles {
 
 // Check `gpu.shuffle` conversion with explicit subgroup size.
 
-gpu.module @shuffles attributes {
-  spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Kernel, Addresses, GroupNonUniformShuffle, Int64], []>, #spirv.resource_limits<subgroup_size = 16>>
-} {
+gpu.module @shuffles {
   // CHECK:           llvm.func spir_funccc @_Z22sub_group_shuffle_downdj(f64, i32) -> f64 attributes {
   // CHECK-SAME-DAG:  no_unwind
   // CHECK-SAME-DAG:  convergent
@@ -352,7 +350,7 @@ gpu.module @shuffles attributes {
   // CHECK-SAME:              (%[[I8_VAL:.*]]: i8, %[[I16_VAL:.*]]: i16,
   // CHECK-SAME:               %[[I32_VAL:.*]]: i32, %[[I64_VAL:.*]]: i64,
   // CHECK-SAME:               %[[F16_VAL:.*]]: f16, %[[F32_VAL:.*]]: f32,
-  // CHECK-SAME:               %[[F64_VAL:.*]]: f64,  %[[OFFSET:.*]]: i32) {
+  // CHECK-SAME:               %[[F64_VAL:.*]]: f64,  %[[OFFSET:.*]]: i32)
   func.func @gpu_shuffles(%i8_val: i8,
                           %i16_val: i16,
                           %i32_val: i32,
@@ -360,7 +358,7 @@ gpu.module @shuffles attributes {
                           %f16_val: f16,
                           %f32_val: f32,
                           %f64_val: f64,
-                          %offset: i32) {
+                          %offset: i32) attributes {gpu.known_subgroup_size = 16 : i32} {
     %width = arith.constant 16 : i32
     // CHECK:         llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[I8_VAL]], %[[OFFSET]])
     // CHECK:         llvm.mlir.constant(true) : i1

``````````
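For readers skimming the patch, here is a minimal sketch of how the new annotation is consumed by the GPUToLLVMSPV shuffle conversion (module and function names are hypothetical; the `gpu.known_subgroup_size` attribute and the requirement that the constant `gpu.shuffle` width match it are taken from the diff above):

```mlir
// The function is annotated with a known subgroup size of 16, so the
// conversion accepts a gpu.shuffle whose constant width is also 16.
gpu.module @example {
  func.func @shuffle_xor(%val: i32, %mask: i32) -> i32
      attributes {gpu.known_subgroup_size = 16 : i32} {
    %width = arith.constant 16 : i32
    %res, %valid = gpu.shuffle xor %val, %mask, %width : i32
    return %res : i32
  }
}
```

If `gpu.known_subgroup_size` is absent, the pattern falls back to the discardable `llvm.intel_reqd_sub_group_size` attribute (as in the first updated test); with neither attribute present the shuffle no longer matches, whereas previously the subgroup size was taken from the `spirv.target_env` resource limits or their default.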



https://github.com/llvm/llvm-project/pull/112732

