[Mlir-commits] [mlir] [mlir][AMDGPU] Improve DPP implementation of subgroup reduction (PR #136804)

Tue Apr 22 20:54:58 PDT 2025

github-actions[bot] wrote:




:warning: The C/C++ code formatter, clang-format, found issues in your code. :warning:

<details>
<summary>
You can test this locally with the following command:
</summary>

``````````bash
git-clang-format --diff HEAD~1 HEAD --extensions h,cpp -- mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp mlir/include/mlir/Dialect/GPU/Transforms/Passes.h mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp mlir/lib/Dialect/GPU/Utils/Utils.cpp mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
``````````

</details>

<details>
<summary>
View the diff from clang-format here.
</summary>

``````````diff

diff --git a/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
index f766dab8c..3c42b1f4e 100644
--- a/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
+++ b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
@@ -9,9 +9,9 @@
 #ifndef MLIR_DIALECT_GPU_TRANSFORMS_REDUCTIONUTILS_H_
 #define MLIR_DIALECT_GPU_TRANSFORMS_REDUCTIONUTILS_H_
 
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
@@ -27,7 +27,7 @@ struct ClusterInfo {
 };
 
 FailureOr<ClusterInfo> getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
-  unsigned subgroupSize);
+                                                 unsigned subgroupSize);
 
 FailureOr<Value>
 createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 57af63cbe..7f5e38b79 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -161,7 +161,8 @@ struct ScalarizeSingleElementReduce final
 
 //   std::optional<uint32_t> clusterSize = op.getClusterSize();
 //   assert(!clusterSize ||
-//          llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
+//          llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught
+//          this.
 //   if (clusterSize && *clusterSize > subgroupSize)
 //     return op.emitOpError()
 //            << "cluster size " << *clusterSize
@@ -169,8 +170,8 @@ struct ScalarizeSingleElementReduce final
 //   unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
 
 //   auto clusterStride = op.getClusterStride();
-//   assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
-//   if (clusterStride >= subgroupSize)
+//   assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught
+//   this. if (clusterStride >= subgroupSize)
 //     return op.emitOpError()
 //            << "cluster stride " << clusterStride
 //            << " is not less than subgroup size " << subgroupSize;
@@ -369,7 +370,8 @@ private:
 };
 
 // FailureOr<Value>
-// createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
+// createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp
+// op,
 //                            Value input, gpu::AllReduceOperation mode,
 //                            const ClusterInfo &ci, amdgpu::Chipset chipset) {
 //   Location loc = op.getLoc();
@@ -382,18 +384,22 @@ private:
 //     // Perform reduction between all lanes N <-> N+1.
 //     dpp = rewriter.create<amdgpu::DPPOp>(
 //         loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
-//         rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
+//         rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks,
+//         boundCtrl);
 //     res = vector::makeArithReduction(rewriter, loc,
-//                                      gpu::convertReductionKind(mode), res, dpp);
+//                                      gpu::convertReductionKind(mode), res,
+//                                      dpp);
 //   }
 
 //   if (ci.clusterSize >= 4) {
 //     // Perform reduction between all lanes N <-> N+2.
 //     dpp = rewriter.create<amdgpu::DPPOp>(
 //         loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
-//         rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
+//         rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks,
+//         boundCtrl);
 //     res = vector::makeArithReduction(rewriter, loc,
-//                                      gpu::convertReductionKind(mode), res, dpp);
+//                                      gpu::convertReductionKind(mode), res,
+//                                      dpp);
 //   }
 //   if (ci.clusterSize >= 8) {
 //     // Perform reduction between all lanes N <-> 7-N,
@@ -402,16 +408,18 @@ private:
 //         loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
 //         rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
 //     res = vector::makeArithReduction(rewriter, loc,
-//                                      gpu::convertReductionKind(mode), res, dpp);
+//                                      gpu::convertReductionKind(mode), res,
+//                                      dpp);
 //   }
 //   if (ci.clusterSize >= 16) {
 //     // Perform reduction between all lanes N <-> 15-N,
-//     // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
-//     dpp = rewriter.create<amdgpu::DPPOp>(
+//     // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <->
+//     lane[8]. dpp = rewriter.create<amdgpu::DPPOp>(
 //         loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror,
 //         rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
 //     res = vector::makeArithReduction(rewriter, loc,
-//                                      gpu::convertReductionKind(mode), res, dpp);
+//                                      gpu::convertReductionKind(mode), res,
+//                                      dpp);
 //   }
 //   if (ci.clusterSize >= 32) {
 //     if (chipset.majorVersion <= 9) {
@@ -427,7 +435,8 @@ private:
 //       // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
 //       Value uint32Max = rewriter.create<arith::ConstantOp>(
 //           loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
-//       dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
+//       dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res,
+//       res,
 //                                                   uint32Max, uint32Max,
 //                                                   /*fi=*/true,
 //                                                   /*bound_ctrl=*/false);
@@ -437,7 +446,8 @@ private:
 //         Value lane0 = rewriter.create<arith::ConstantOp>(
 //             loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
 //         res =
-//             rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+//             rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res,
+//             lane0);
 //       }
 //     } else {
 //       return rewriter.notifyMatchFailure(
@@ -462,15 +472,17 @@ private:
 //           loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
 //       Value lane32 = rewriter.create<arith::ConstantOp>(
 //           loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32));
-//       dpp = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane32);
-//       res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+//       dpp = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res,
+//       lane32); res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(),
+//       res, lane0);
 //     } else {
 //       return rewriter.notifyMatchFailure(
 //           op, "Subgroup reduce lowering to DPP not currently supported for "
 //               "this device.");
 //     }
 //     res = vector::makeArithReduction(rewriter, loc,
-//                                      gpu::convertReductionKind(mode), res, dpp);
+//                                      gpu::convertReductionKind(mode), res,
+//                                      dpp);
 //   }
 //   assert(res.getType() == input.getType());
 //   return res;
@@ -484,8 +496,9 @@ struct ScalarSubgroupReduceToDPP final
   ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
                             unsigned shuffleBitwidth, bool matchClustered,
                             amdgpu::Chipset chipset, PatternBenefit benefit)
-      : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize), shuffleBitwidth(shuffleBitwidth),
-        matchClustered(matchClustered), chipset(chipset) {}
+      : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
+        shuffleBitwidth(shuffleBitwidth), matchClustered(matchClustered),
+        chipset(chipset) {}
 
   LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
                                 PatternRewriter &rewriter) const override {
@@ -540,8 +553,9 @@ struct ScalarSubgroupReduceToDPP final
       return rewriter.create<arith::BitcastOp>(loc, valueTy, asInt);
     };
 
-    FailureOr<Value> dpp = createSubgroupDPPReduction(
-        rewriter, op, op.getValue(), op.getOp(), *ci, chipset, packFn, unpackFn);
+    FailureOr<Value> dpp =
+        createSubgroupDPPReduction(rewriter, op, op.getValue(), op.getOp(), *ci,
+                                   chipset, packFn, unpackFn);
     if (failed(dpp))
       return failure();
 
diff --git a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
index 255c4152b..a310da013 100644
--- a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
+++ b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
@@ -10,11 +10,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
-#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
 #include "mlir/Dialect/GPU/Utils/ReductionUtils.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/Value.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
@@ -24,7 +24,7 @@
 using namespace mlir;
 
 FailureOr<ClusterInfo> mlir::getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
-                                                 unsigned subgroupSize) {
+                                                       unsigned subgroupSize) {
   assert(llvm::isPowerOf2_32(subgroupSize));
 
   std::optional<uint32_t> clusterSize = op.getClusterSize();
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index 4ebcf897f..fd8b34288 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -93,9 +93,11 @@ struct TestGpuSubgroupReduceLoweringPass
       auto maybeChipset = amdgpu::Chipset::parse(target);
       if (succeeded(maybeChipset)) {
         populateGpuLowerSubgroupReduceToDPPPatterns(
-            patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset, PatternBenefit(2));
+            patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32,
+            *maybeChipset, PatternBenefit(2));
         populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
-            patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset, PatternBenefit(2));
+            patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32,
+            *maybeChipset, PatternBenefit(2));
       }
       populateGpuLowerSubgroupReduceToShufflePatterns(
           patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32);

``````````

</details>


https://github.com/llvm/llvm-project/pull/136804