[Mlir-commits] [mlir] [mlir][AMDGPU] Improve DPP implementation of subgroup reduction (PR #136804)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Tue Apr 22 20:54:58 PDT 2025
github-actions[bot] wrote:
<!--LLVM CODE FORMAT COMMENT: {clang-format}-->
:warning: C/C++ code formatter, clang-format found issues in your code. :warning:
<details>
<summary>
You can test this locally with the following command:
</summary>
``````````bash
git-clang-format --diff HEAD~1 HEAD --extensions h,cpp -- mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp mlir/include/mlir/Dialect/GPU/Transforms/Passes.h mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp mlir/lib/Dialect/GPU/Utils/Utils.cpp mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
``````````
</details>
<details>
<summary>
View the diff from clang-format here.
</summary>
``````````diff
diff --git a/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
index f766dab8c..3c42b1f4e 100644
--- a/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
+++ b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
@@ -9,9 +9,9 @@
#ifndef MLIR_DIALECT_GPU_TRANSFORMS_REDUCTIONUTILS_H_
#define MLIR_DIALECT_GPU_TRANSFORMS_REDUCTIONUTILS_H_
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
@@ -27,7 +27,7 @@ struct ClusterInfo {
};
FailureOr<ClusterInfo> getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
- unsigned subgroupSize);
+ unsigned subgroupSize);
FailureOr<Value>
createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 57af63cbe..7f5e38b79 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -161,7 +161,8 @@ struct ScalarizeSingleElementReduce final
// std::optional<uint32_t> clusterSize = op.getClusterSize();
// assert(!clusterSize ||
-// llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
+// llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught
+// this.
// if (clusterSize && *clusterSize > subgroupSize)
// return op.emitOpError()
// << "cluster size " << *clusterSize
@@ -169,8 +170,8 @@ struct ScalarizeSingleElementReduce final
// unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
// auto clusterStride = op.getClusterStride();
-// assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
-// if (clusterStride >= subgroupSize)
+// assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught
+// this. if (clusterStride >= subgroupSize)
// return op.emitOpError()
// << "cluster stride " << clusterStride
// << " is not less than subgroup size " << subgroupSize;
@@ -369,7 +370,8 @@ private:
};
// FailureOr<Value>
-// createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
+// createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp
+// op,
// Value input, gpu::AllReduceOperation mode,
// const ClusterInfo &ci, amdgpu::Chipset chipset) {
// Location loc = op.getLoc();
@@ -382,18 +384,22 @@ private:
// // Perform reduction between all lanes N <-> N+1.
// dpp = rewriter.create<amdgpu::DPPOp>(
// loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
-// rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
+// rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks,
+// boundCtrl);
// res = vector::makeArithReduction(rewriter, loc,
-// gpu::convertReductionKind(mode), res, dpp);
+// gpu::convertReductionKind(mode), res,
+// dpp);
// }
// if (ci.clusterSize >= 4) {
// // Perform reduction between all lanes N <-> N+2.
// dpp = rewriter.create<amdgpu::DPPOp>(
// loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
-// rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
+// rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks,
+// boundCtrl);
// res = vector::makeArithReduction(rewriter, loc,
-// gpu::convertReductionKind(mode), res, dpp);
+// gpu::convertReductionKind(mode), res,
+// dpp);
// }
// if (ci.clusterSize >= 8) {
// // Perform reduction between all lanes N <-> 7-N,
@@ -402,16 +408,18 @@ private:
// loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
// rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
// res = vector::makeArithReduction(rewriter, loc,
-// gpu::convertReductionKind(mode), res, dpp);
+// gpu::convertReductionKind(mode), res,
+// dpp);
// }
// if (ci.clusterSize >= 16) {
// // Perform reduction between all lanes N <-> 15-N,
-// // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
-// dpp = rewriter.create<amdgpu::DPPOp>(
+// // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <->
+// lane[8]. dpp = rewriter.create<amdgpu::DPPOp>(
// loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror,
// rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
// res = vector::makeArithReduction(rewriter, loc,
-// gpu::convertReductionKind(mode), res, dpp);
+// gpu::convertReductionKind(mode), res,
+// dpp);
// }
// if (ci.clusterSize >= 32) {
// if (chipset.majorVersion <= 9) {
@@ -427,7 +435,8 @@ private:
// // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
// Value uint32Max = rewriter.create<arith::ConstantOp>(
// loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
-// dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
+// dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res,
+// res,
// uint32Max, uint32Max,
// /*fi=*/true,
// /*bound_ctrl=*/false);
@@ -437,7 +446,8 @@ private:
// Value lane0 = rewriter.create<arith::ConstantOp>(
// loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
// res =
-// rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+// rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res,
+// lane0);
// }
// } else {
// return rewriter.notifyMatchFailure(
@@ -462,15 +472,17 @@ private:
// loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
// Value lane32 = rewriter.create<arith::ConstantOp>(
// loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32));
-// dpp = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane32);
-// res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+// dpp = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res,
+// lane32); res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(),
+// res, lane0);
// } else {
// return rewriter.notifyMatchFailure(
// op, "Subgroup reduce lowering to DPP not currently supported for "
// "this device.");
// }
// res = vector::makeArithReduction(rewriter, loc,
-// gpu::convertReductionKind(mode), res, dpp);
+// gpu::convertReductionKind(mode), res,
+// dpp);
// }
// assert(res.getType() == input.getType());
// return res;
@@ -484,8 +496,9 @@ struct ScalarSubgroupReduceToDPP final
ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
unsigned shuffleBitwidth, bool matchClustered,
amdgpu::Chipset chipset, PatternBenefit benefit)
- : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize), shuffleBitwidth(shuffleBitwidth),
- matchClustered(matchClustered), chipset(chipset) {}
+ : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
+ shuffleBitwidth(shuffleBitwidth), matchClustered(matchClustered),
+ chipset(chipset) {}
LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
PatternRewriter &rewriter) const override {
@@ -540,8 +553,9 @@ struct ScalarSubgroupReduceToDPP final
return rewriter.create<arith::BitcastOp>(loc, valueTy, asInt);
};
- FailureOr<Value> dpp = createSubgroupDPPReduction(
- rewriter, op, op.getValue(), op.getOp(), *ci, chipset, packFn, unpackFn);
+ FailureOr<Value> dpp =
+ createSubgroupDPPReduction(rewriter, op, op.getValue(), op.getOp(), *ci,
+ chipset, packFn, unpackFn);
if (failed(dpp))
return failure();
diff --git a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
index 255c4152b..a310da013 100644
--- a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
+++ b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
@@ -10,11 +10,11 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
-#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/Dialect/GPU/Utils/ReductionUtils.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Value.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
@@ -24,7 +24,7 @@
using namespace mlir;
FailureOr<ClusterInfo> mlir::getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
- unsigned subgroupSize) {
+ unsigned subgroupSize) {
assert(llvm::isPowerOf2_32(subgroupSize));
std::optional<uint32_t> clusterSize = op.getClusterSize();
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index 4ebcf897f..fd8b34288 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -93,9 +93,11 @@ struct TestGpuSubgroupReduceLoweringPass
auto maybeChipset = amdgpu::Chipset::parse(target);
if (succeeded(maybeChipset)) {
populateGpuLowerSubgroupReduceToDPPPatterns(
- patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset, PatternBenefit(2));
+ patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32,
+ *maybeChipset, PatternBenefit(2));
populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
- patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset, PatternBenefit(2));
+ patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32,
+ *maybeChipset, PatternBenefit(2));
}
populateGpuLowerSubgroupReduceToShufflePatterns(
patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32);
``````````
</details>
https://github.com/llvm/llvm-project/pull/136804
More information about the Mlir-commits mailing list