[Mlir-commits] [mlir] [mlir][ROCDL] Add subgroup_reduce lowering support for gfx10+ devices (PR #135983)

Wed Apr 16 11:04:20 PDT 2025

https://github.com/Muzammiluddin-Syed-ECE updated https://github.com/llvm/llvm-project/pull/135983

>From 44df8d58a5ec9884dc4003d8e1a79a84397c2df8 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 01:07:47 -0500
Subject: [PATCH 1/2] Adding permlanex16 and rowshare to rocdl mlir dialect

Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td    |  4 +++-
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td     | 16 ++++++++++++++++
 .../Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp   |  6 ++++++
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp     | 14 ++++++++++++++
 mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir      |  8 ++++++++
 5 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 108d7237ff703..17c1162170073 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -524,7 +524,8 @@ def AMDGPU_DPPPerm : I32EnumAttr<"DPPPerm",
       I32EnumAttrCase<"row_mirror", 8>,
       I32EnumAttrCase<"row_half_mirror", 9>,
       I32EnumAttrCase<"row_bcast_15", 10>,
-      I32EnumAttrCase<"row_bcast_31", 11>
+      I32EnumAttrCase<"row_bcast_31", 11>,
+      I32EnumAttrCase<"row_share", 12>
     ]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::amdgpu";
@@ -557,6 +558,7 @@ def AMDGPU_DPPOp : AMDGPU_Op<"dpp", [SameTypeOperands, AllTypesMatch<["result",
     - Reverse within a half-row (`row_half_mirror`)
     - Broadcast the 15th lane of each row to the next row (`row_bcast`)
     - Broadcast lane 31 to rows 2 and 3 (`row_bcast`)
+    - Broadcast a lane [0-15] within row 0 to all lanes of row 0 (`row_share`)
   }];
   let results = (outs AnyType:$result);
   let assemblyFormat = [{
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 900155c274b4d..186a4f53f93cb 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -668,6 +668,22 @@ def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0],
   }];
 }
 
+// PermLaneX16 intrinsic operation
+def ROCDL_PermlaneX16Op : ROCDL_IntrOp<"permlanex16", [], [0],
+    [AllTypesMatch<["res", "old", "src0"]>, AllTypesMatch<["src1", "src2"]>], 1, 0, 0,
+    [4, 5], ["fi", "boundControl"]>,
+  Arguments<(ins LLVM_Type:$old, LLVM_Type:$src0, LLVM_Type:$src1, LLVM_Type:$src2,
+             I1Attr:$fi, I1Attr:$boundControl)> {
+  let results = (outs LLVM_Type:$res);
+  let assemblyFormat = [{
+    attr-dict $old `,` $src0 `,` $src1 `,` $src2 `,` $fi `,` $boundControl `:` type($src0) `,` type($src1)
+  }];
+  let description = [{
+    Performs a `permlanex16` operation with the given operands, applying the
+    permutation specified by $fi to the provided inputs.
+  }];
+}
+
 def ROCDL_V2I16Type : FixedVectorOfLengthAndType<[2], [I16]>,
                         BuildableType<"::mlir::VectorType::get("
                           "{2},$_builder.getI16Type())">;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 5f697bdeef566..4d343c8f3200c 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1293,6 +1293,7 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
       ROW_HALF_MIRROR = 0x141,
       BCAST15 = 0x142,
       BCAST31 = 0x143,
+      ROW_SHARE0 = 0x150
     };
 
     auto kind = DppOp.getKind();
@@ -1350,6 +1351,11 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
     case DPPPerm::row_bcast_31:
       DppCtrl = DppCtrl::BCAST31;
       break;
+    case DPPPerm::row_share:
+      if (auto intAttr = cast<IntegerAttr>(*permArgument)) {
+        DppCtrl = intAttr.getInt() + DppCtrl::ROW_SHARE0;
+      }
+      break;
     }
 
     // Check for row_mask, bank_mask, bound_ctrl if they exist and create
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 549a4376a4a04..af4438f028542 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -468,6 +468,20 @@ LogicalResult DPPOp::verify() {
     }
     break;
   }
+
+  case DPPPerm::row_share: {
+    if (!permArgument) {
+      return emitOpError("Attribute '" + Twine(stringifyDPPPerm(kind)) +
+                         "' value not specified");
+    }
+    if (auto intAttr = dyn_cast<IntegerAttr>(permArgument)) {
+      uint32_t attrValue = intAttr.getInt();
+      if (attrValue < 0 || attrValue > 15) {
+        return emitOpError(
+            "Attribute value for 'row_share' must be between 0 and 15");
+      }
+    }
+  } break;
   }
   return success();
 }
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
index 14691e73e62d7..64b3328b70ab4 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
@@ -137,3 +137,11 @@ func.func @row_bcast_update_dpp_f16(%arg0: f16, %arg1: f16) -> f16 {
   %0 = amdgpu.dpp %arg0 %arg1 row_bcast_15 { bound_ctrl = true } : f16
     return %0 : f16
 }
+
+func.func @dpp_row_share(%arg0: i32, %arg1: i32) -> i32 {  
+  // CHECK-LABEL: func @dpp_row_share  
+  // CHECK: rocdl.update.dpp %arg0, %arg1 with 351, 15, 15, false : i32  
+  // CHECK: return %0 : i32  
+  %0 = amdgpu.dpp %arg0 %arg1 row_share ( 0xf : i32 ) : i32  
+  return %0 : i32  
+}

>From a5dd442fbed9dd88198be96ff3e2c6ca780f579d Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 13:04:08 -0500
Subject: [PATCH 2/2] Updating tests and removing row share related changes

Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td      |  4 +---
 .../lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp |  6 ------
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp       | 14 --------------
 mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir        |  8 --------
 mlir/test/Dialect/LLVMIR/rocdl.mlir                | 10 ++++++++++
 mlir/test/Target/LLVMIR/rocdl.mlir                 | 14 ++++++++++++++
 6 files changed, 25 insertions(+), 31 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 17c1162170073..108d7237ff703 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -524,8 +524,7 @@ def AMDGPU_DPPPerm : I32EnumAttr<"DPPPerm",
       I32EnumAttrCase<"row_mirror", 8>,
       I32EnumAttrCase<"row_half_mirror", 9>,
       I32EnumAttrCase<"row_bcast_15", 10>,
-      I32EnumAttrCase<"row_bcast_31", 11>,
-      I32EnumAttrCase<"row_share", 12>
+      I32EnumAttrCase<"row_bcast_31", 11>
     ]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::amdgpu";
@@ -558,7 +557,6 @@ def AMDGPU_DPPOp : AMDGPU_Op<"dpp", [SameTypeOperands, AllTypesMatch<["result",
     - Reverse within a half-row (`row_half_mirror`)
     - Broadcast the 15th lane of each row to the next row (`row_bcast`)
     - Broadcast lane 31 to rows 2 and 3 (`row_bcast`)
-    - Broadcast a lane [0-15] within row 0 to all lanes of row 0 (`row_share`)
   }];
   let results = (outs AnyType:$result);
   let assemblyFormat = [{
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 4d343c8f3200c..5f697bdeef566 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1293,7 +1293,6 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
       ROW_HALF_MIRROR = 0x141,
       BCAST15 = 0x142,
       BCAST31 = 0x143,
-      ROW_SHARE0 = 0x150
     };
 
     auto kind = DppOp.getKind();
@@ -1351,11 +1350,6 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
     case DPPPerm::row_bcast_31:
       DppCtrl = DppCtrl::BCAST31;
       break;
-    case DPPPerm::row_share:
-      if (auto intAttr = cast<IntegerAttr>(*permArgument)) {
-        DppCtrl = intAttr.getInt() + DppCtrl::ROW_SHARE0;
-      }
-      break;
     }
 
     // Check for row_mask, bank_mask, bound_ctrl if they exist and create
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index af4438f028542..549a4376a4a04 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -468,20 +468,6 @@ LogicalResult DPPOp::verify() {
     }
     break;
   }
-
-  case DPPPerm::row_share: {
-    if (!permArgument) {
-      return emitOpError("Attribute '" + Twine(stringifyDPPPerm(kind)) +
-                         "' value not specified");
-    }
-    if (auto intAttr = dyn_cast<IntegerAttr>(permArgument)) {
-      uint32_t attrValue = intAttr.getInt();
-      if (attrValue < 0 || attrValue > 15) {
-        return emitOpError(
-            "Attribute value for 'row_share' must be between 0 and 15");
-      }
-    }
-  } break;
   }
   return success();
 }
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
index 64b3328b70ab4..14691e73e62d7 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
@@ -137,11 +137,3 @@ func.func @row_bcast_update_dpp_f16(%arg0: f16, %arg1: f16) -> f16 {
   %0 = amdgpu.dpp %arg0 %arg1 row_bcast_15 { bound_ctrl = true } : f16
     return %0 : f16
 }
-
-func.func @dpp_row_share(%arg0: i32, %arg1: i32) -> i32 {  
-  // CHECK-LABEL: func @dpp_row_share  
-  // CHECK: rocdl.update.dpp %arg0, %arg1 with 351, 15, 15, false : i32  
-  // CHECK: return %0 : i32  
-  %0 = amdgpu.dpp %arg0 %arg1 row_share ( 0xf : i32 ) : i32  
-  return %0 : i32  
-}
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 828fd58544597..cda1a9ca5f1f6 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -889,6 +889,16 @@ llvm.func @rocdl.readlane(%src : f32) -> f32 {
 
 // -----
 
+llvm.func @rocdl.permlanex16(%src : f32) -> f32 {
+  %cst0 = llvm.mlir.constant(-1 : i32) : i32
+  // CHECK-LABEL: rocdl.permlanex16
+  // CHECK: rocdl.permlanex16 %{{.*}} %{{.*}}
+  %ret = rocdl.permlanex16 %src, %src, %cst0, %cst0, 0, -1 : f32, i32
+  llvm.return %ret : f32
+}
+
+// -----
+
 // expected-error at below {{attribute attached to unexpected op}}
 func.func private @expected_llvm_func() attributes { rocdl.kernel }
 
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index e70617bfff99e..3db1f7b2b6427 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -872,6 +872,20 @@ llvm.func @rocdl.make.buffer.rsrc.p7.p1(%ptr : !llvm.ptr<1>,
   llvm.return %rsrc : !llvm.ptr<7>
 }
 
+llvm.func @rocdl.permlanex16(%src0 : f32, %src1 : i32, %src2 : vector<2 x f32>, %src3 : vector<2 x i32>) -> f32 {
+  %cst0 = llvm.mlir.constant(-1 : i32) : i32
+  // CHECK-LABEL: rocdl.permlanex16
+  // CHECK: call float @llvm.amdgcn.permlanex16.f32(float %{{.*}}, float %{{.*}}, i32 -1, i32 -1, i1 false, i1 true)
+  %ret0 = rocdl.permlanex16 %src0, %src0, %cst0, %cst0, 0, -1 : f32, i32
+  // CHECK: call i32 @llvm.amdgcn.permlanex16.i32(i32 %{{.*}}, i32 %{{.*}}, i32 -1, i32 -1, i1 false, i1 true)
+  %ret1 = rocdl.permlanex16 %src1, %src1, %cst0, %cst0, 0, -1 : i32, i32
+  // CHECK: call <2 x float> @llvm.amdgcn.permlanex16.v2f32(<2 x float> %{{.*}}, <2 x float> %{{.*}}, i32 -1, i32 -1, i1 false, i1 true)
+  %ret2 = rocdl.permlanex16 %src2, %src2, %cst0, %cst0, 0, -1 : vector<2 x f32>, i32
+  // CHECK: call <2 x i32> @llvm.amdgcn.permlanex16.v2i32(<2 x i32> %{{.*}}, <2 x i32> %{{.*}}, i32 -1, i32 -1, i1 false, i1 true)
+  %ret3 = rocdl.permlanex16 %src3, %src3, %cst0, %cst0, 0, -1 : vector<2 x i32>, i32
+  llvm.return %ret0 : f32
+}
+
 llvm.func @rocdl.wmma.fp8(%arg0 : vector<2 x i32>, %arg1 : vector<8xf32>) -> vector<8xf32> {
   // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <8 x float> %{{.*}})
   %r0 = rocdl.wmma.f32.16x16x16.fp8_fp8 %arg0, %arg0, %arg1: (vector<2xi32>, vector<2xi32>, vector<8xf32>) -> vector<8xf32>