[Mlir-commits] [mlir] [mlir][AMDGPU] Improve DPP implementation of subgroup reduction (PR #136804)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Tue Apr 22 20:56:48 PDT 2025
https://github.com/Muzammiluddin-Syed-ECE updated https://github.com/llvm/llvm-project/pull/136804
>From 029b2ccce15d08900dd3aeaed1968e1b011fb6f0 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Tue, 25 Mar 2025 14:04:06 -0500
Subject: [PATCH 01/28] Creates GPUToAMDGPUPass to house a subgroup reduce
lowering pattern to DPP ops.
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h | 32 ++++
mlir/include/mlir/Conversion/Passes.h | 1 +
mlir/include/mlir/Conversion/Passes.td | 16 ++
mlir/lib/Conversion/CMakeLists.txt | 1 +
.../lib/Conversion/GPUToAMDGPU/CMakeLists.txt | 22 +++
.../Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp | 176 ++++++++++++++++++
mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt | 1 +
7 files changed, 249 insertions(+)
create mode 100644 mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
create mode 100644 mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
create mode 100644 mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
diff --git a/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
new file mode 100644
index 0000000000000..2d3bb384235ca
--- /dev/null
+++ b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
@@ -0,0 +1,32 @@
+//===- GPUToAMDGPU.h - Convert AMDGPU to ROCDL dialect --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
+#define MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
+
+
+#include "mlir/IR/PatternMatch.h"
+#include <memory>
+#include <string>
+
+namespace mlir {
+
+class LLVMTypeConverter;
+class RewritePatternSet;
+class TypeConverter;
+class Pass;
+
+#define GEN_PASS_DECL_CONVERTGPUTOAMDGPUPASS
+#include "mlir/Conversion/Passes.h.inc"
+
+void populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns,
+ unsigned subgroupSize,
+ PatternBenefit benefit);
+
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
\ No newline at end of file
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index ccd862f67c068..1189423799092 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -34,6 +34,7 @@
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
#include "mlir/Conversion/FuncToSPIRV/FuncToSPIRVPass.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
#include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index bbba495e613b2..b28b4900e6814 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -643,6 +643,22 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> {
];
}
+//===----------------------------------------------------------------------===//
+// GPUToAMDGPU
+//===----------------------------------------------------------------------===//
+
+def ConvertGPUToAMDGPUPass : Pass<"convert-gpu-to-amdgpu"> {
+ let summary = "Generate AMDGPU operations for gpu operations";
+ let dependentDialects = [
+ "amdgpu::AMDGPUDialect",
+ "LLVM::LLVMDialect",
+ "ROCDL::ROCDLDialect",
+ ];
+ let options = [Option<"subgroupSize", "subgroup-size", "unsigned",
+ /*default=*/"64",
+ "Size of subgroup">];
+}
+
//===----------------------------------------------------------------------===//
// ConvertIndexToLLVMPass
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index b6c21440c571c..b957a4473f1e6 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -24,6 +24,7 @@ add_subdirectory(FuncToEmitC)
add_subdirectory(FuncToLLVM)
add_subdirectory(FuncToSPIRV)
add_subdirectory(GPUCommon)
+add_subdirectory(GPUToAMDGPU)
add_subdirectory(GPUToLLVMSPV)
add_subdirectory(GPUToNVVM)
add_subdirectory(GPUToROCDL)
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
new file mode 100644
index 0000000000000..9b82b5dc63d9c
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
@@ -0,0 +1,22 @@
+add_mlir_conversion_library(MLIRGPUToAMDGPU
+ GPUToAMDGPU.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/GPUToAMDGPU
+
+ DEPENDS
+ MLIRConversionPassIncGen
+
+ LINK_COMPONENTS
+ Core
+
+ LINK_LIBS PUBLIC
+ MLIRLLVMCommonConversion
+ MLIRLLVMDialect
+ MLIRGPUDialect
+ MLIRAMDGPUDialect
+ MLIRAMDGPUUtils
+ MLIRROCDLDialect
+ MLIRPass
+ MLIRTransforms
+ )
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
new file mode 100644
index 0000000000000..bab83c12157a9
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
@@ -0,0 +1,176 @@
+//===- GPUToAMDGPU.cpp - GPU to AMDGPU dialect conversion -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
+
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Pass/Pass.h"
+
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+
+#include "mlir/Transforms/WalkPatternRewriteDriver.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace mlir {
+#define GEN_PASS_DEF_CONVERTGPUTOAMDGPUPASS
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+
+namespace {
+struct ClusterInfo {
+ unsigned clusterStride;
+ unsigned clusterSize;
+ unsigned subgroupSize;
+};
+
+static FailureOr<ClusterInfo>
+getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) {
+ assert(llvm::isPowerOf2_32(subgroupSize));
+
+ std::optional<uint32_t> clusterSize = op.getClusterSize();
+ assert(!clusterSize ||
+ llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
+ if (clusterSize && *clusterSize > subgroupSize)
+ return op.emitOpError()
+ << "cluster size " << *clusterSize
+ << " is greater than subgroup size " << subgroupSize;
+ unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
+
+ auto clusterStride = op.getClusterStride();
+ assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
+ if (clusterStride >= subgroupSize)
+ return op.emitOpError()
+ << "cluster stride " << clusterStride
+ << " is not less than subgroup size " << subgroupSize;
+
+ return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
+}
+
+Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
+ gpu::AllReduceOperation mode,
+ const ClusterInfo &ci) {
+ Value result = input;
+ if (ci.clusterSize >= 2) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shr, permArg);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 4) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shr, permArg);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 8) {
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+ b.getUnitAttr());
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 16) {
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 32) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+ b.getUnitAttr(), 10, 15, false);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize == 64) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
+ b.getUnitAttr(), 12, 15, false);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ auto int32Type = IntegerType::get(b.getContext(), 32);
+ Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+ result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
+ assert(result.getType() == input.getType());
+ return result;
+}
+
+struct ScalarSubgroupReduceToShuffles final
+ : OpRewritePattern<gpu::SubgroupReduceOp> {
+ ScalarSubgroupReduceToShuffles(MLIRContext *ctx, unsigned subgroupSize,
+ bool matchClustered, PatternBenefit benefit)
+ : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
+ matchClustered(matchClustered) {}
+
+ LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
+ PatternRewriter &rewriter) const override {
+ if (op.getClusterSize().has_value() != matchClustered) {
+ return rewriter.notifyMatchFailure(
+ op, llvm::formatv("op is {0}clustered but pattern is configured to "
+ "only match {1}clustered ops",
+ matchClustered ? "non-" : "",
+ matchClustered ? "" : "non-"));
+ }
+
+ auto ci = getAndValidateClusterInfo(op, subgroupSize);
+ if (failed(ci))
+ return failure();
+
+ Location loc = op.getLoc();
+ rewriter.replaceOp(op, createSubgroupDPPReduction(
+ rewriter, loc, op.getValue(), op.getOp(), *ci));
+ return success();
+ }
+
+private:
+ unsigned subgroupSize = 0;
+ bool matchClustered = false;
+};
+
+struct ConvertGPUToAMDGPUPass
+ : public impl::ConvertGPUToAMDGPUPassBase<ConvertGPUToAMDGPUPass> {
+ using Base::Base;
+
+ void runOnOperation() override {
+ RewritePatternSet patterns(&getContext());
+ int subgroupSizeInt = static_cast<int>(subgroupSize);
+ populateAMDGPUOptimizedSubgroupReducePatterns(patterns, subgroupSizeInt,
+ PatternBenefit(1));
+ walkAndApplyPatterns(getOperation(), std::move(patterns));
+ }
+};
+} // namespace
+
+void mlir::populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns,
+ unsigned subgroupSize,
+ PatternBenefit benefit) {
+ patterns.add<ScalarSubgroupReduceToShuffles>(
+ patterns.getContext(), subgroupSize, /*matchClustered=*/true, benefit);
+}
diff --git a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
index 945e3ccdfa87b..52484ac69a3e2 100644
--- a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
@@ -15,6 +15,7 @@ add_mlir_conversion_library(MLIRGPUToROCDLTransforms
MLIRMathToLLVM
MLIRMathToROCDL
MLIRAMDGPUToROCDL
+ MLIRGPUToAMDGPU
MLIRFuncToLLVM
MLIRGPUDialect
MLIRGPUToGPURuntimeTransforms
>From 427c81705a1be5178cacbe50a213d5b3ee9f68b3 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 2 Apr 2025 17:48:56 -0500
Subject: [PATCH 02/28] Fix for numerical issues in MatVec tests
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
index bab83c12157a9..b07ed0a7c636a 100644
--- a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
+++ b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
@@ -82,26 +82,31 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
}
if (ci.clusterSize >= 8) {
- Value dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
- b.getUnitAttr());
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 4);
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shr, permArg);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
if (ci.clusterSize >= 16) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 8);
Value dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
+ amdgpu::DPPPerm::row_shr, permArg);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
+ const int allRows = 0xf;
+ const int allBanks = 0xf;
+
if (ci.clusterSize >= 32) {
auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
Value dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
- b.getUnitAttr(), 10, 15, false);
+ b.getUnitAttr(), 0xa, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
@@ -110,7 +115,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
Value dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
- b.getUnitAttr(), 12, 15, false);
+ b.getUnitAttr(), allRows, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
>From 655251b5fd2713b3eacb38953425c5d71288beb6 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Thu, 3 Apr 2025 15:08:59 -0500
Subject: [PATCH 03/28] Rewrites pattern to be closer to device lib impl.
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../mlir/Dialect/GPU/Transforms/Passes.h | 7 ++
.../Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp | 27 ++---
.../GPU/Transforms/SubgroupReduceLowering.cpp | 109 ++++++++++++++++++
3 files changed, 130 insertions(+), 13 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index 5cc65082a7e56..41e0759e958b5 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -62,6 +62,13 @@ void populateGpuLowerSubgroupReduceToShufflePatterns(
RewritePatternSet &patterns, unsigned subgroupSize,
unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);
+/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp`
+/// ops over scalar types. Assumes that the subgroup has
+/// `subgroupSize` lanes. Applicable only to AMD GPUs.
+void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns,
+ unsigned subgroupSize,
+ PatternBenefit benefit = 1);
+
/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns`
/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
index b07ed0a7c636a..590fa7d9b4ffc 100644
--- a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
+++ b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
@@ -67,7 +67,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
Value dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shr, permArg);
+ amdgpu::DPPPerm::row_shl, permArg);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
@@ -76,39 +76,41 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
Value dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shr, permArg);
+ amdgpu::DPPPerm::row_shl, permArg);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
if (ci.clusterSize >= 8) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 4);
- Value dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shr, permArg);
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+ b.getUnitAttr());
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
if (ci.clusterSize >= 16) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 8);
Value dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shr, permArg);
+ amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
const int allRows = 0xf;
const int allBanks = 0xf;
-
+ auto int32Type = IntegerType::get(b.getContext(), 32);
if (ci.clusterSize >= 32) {
auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
Value dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
b.getUnitAttr(), 0xa, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
+ result, dppResult);
+ if (ci.subgroupSize == 32) {
+ Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
+ result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
+ }
}
if (ci.clusterSize == 64) {
@@ -118,11 +120,10 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
b.getUnitAttr(), allRows, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
+ Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+ result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
}
- auto int32Type = IntegerType::get(b.getContext(), 32);
- Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
- result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
assert(result.getType() == input.getType());
return result;
}
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 43eff3eddcc49..f07ef6cf154a9 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -12,6 +12,8 @@
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -362,6 +364,106 @@ struct VectorSubgroupReduceToShuffles final
unsigned shuffleBitwidth = 0;
bool matchClustered = false;
};
+
+Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
+ gpu::AllReduceOperation mode,
+ const ClusterInfo &ci) {
+ Value result = input;
+ if (ci.clusterSize >= 2) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shl, permArg);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 4) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shl, permArg);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 8) {
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+ b.getUnitAttr());
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 16) {
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ const int allRows = 0xf;
+ const int allBanks = 0xf;
+ auto int32Type = IntegerType::get(b.getContext(), 32);
+ if (ci.clusterSize >= 32) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+ b.getUnitAttr(), 0xa, allBanks, false);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ if (ci.subgroupSize == 32) {
+ Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
+ result =
+ b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
+ }
+ }
+
+ if (ci.clusterSize == 64) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
+ b.getUnitAttr(), allRows, allBanks, false);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+ result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
+ }
+
+ assert(result.getType() == input.getType());
+ return result;
+}
+
+struct ScalarSubgroupReduceToDPP final
+ : OpRewritePattern<gpu::SubgroupReduceOp> {
+ ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
+ bool matchClustered, PatternBenefit benefit)
+ : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
+ matchClustered(matchClustered) {}
+
+ LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
+ PatternRewriter &rewriter) const override {
+ if (op.getClusterSize().has_value() != matchClustered) {
+ return rewriter.notifyMatchFailure(
+ op, llvm::formatv("op is {0}clustered but pattern is configured to "
+ "only match {1}clustered ops",
+ matchClustered ? "non-" : "",
+ matchClustered ? "" : "non-"));
+ }
+ auto ci = getAndValidateClusterInfo(op, subgroupSize);
+ if (failed(ci))
+ return failure();
+ Location loc = op.getLoc();
+ rewriter.replaceOp(op, createSubgroupDPPReduction(
+ rewriter, loc, op.getValue(), op.getOp(), *ci));
+ return success();
+ }
+
+private:
+ unsigned subgroupSize = 0;
+ bool matchClustered = false;
+};
} // namespace
void mlir::populateGpuBreakDownSubgroupReducePatterns(
@@ -372,6 +474,13 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns(
patterns.add<ScalarizeSingleElementReduce>(patterns.getContext(), benefit);
}
+void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
+ RewritePatternSet &patterns, unsigned subgroupSize,
+ PatternBenefit benefit) {
+ patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
+ /*matchClustered=*/true, benefit);
+}
+
void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
RewritePatternSet &patterns, unsigned subgroupSize,
unsigned shuffleBitwidth, PatternBenefit benefit) {
>From 081d6f77b9331366fd332e4c42d192df003dbfe9 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Thu, 3 Apr 2025 15:15:36 -0500
Subject: [PATCH 04/28] Removes GPUToAMDGPUPass, moving pattern into existing pass
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h | 32 ---
mlir/include/mlir/Conversion/Passes.h | 1 -
mlir/include/mlir/Conversion/Passes.td | 16 --
mlir/lib/Conversion/CMakeLists.txt | 1 -
.../lib/Conversion/GPUToAMDGPU/CMakeLists.txt | 22 ---
.../Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp | 182 ------------------
mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt | 1 -
7 files changed, 255 deletions(-)
delete mode 100644 mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
delete mode 100644 mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
delete mode 100644 mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
diff --git a/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
deleted file mode 100644
index 2d3bb384235ca..0000000000000
--- a/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//===- GPUToAMDGPU.h - Convert AMDGPU to ROCDL dialect --*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
-#define MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
-
-
-#include "mlir/IR/PatternMatch.h"
-#include <memory>
-#include <string>
-
-namespace mlir {
-
-class LLVMTypeConverter;
-class RewritePatternSet;
-class TypeConverter;
-class Pass;
-
-#define GEN_PASS_DECL_CONVERTGPUTOAMDGPUPASS
-#include "mlir/Conversion/Passes.h.inc"
-
-void populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns,
- unsigned subgroupSize,
- PatternBenefit benefit);
-
-} // namespace mlir
-
-#endif // MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
\ No newline at end of file
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index 1189423799092..ccd862f67c068 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -34,7 +34,6 @@
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
#include "mlir/Conversion/FuncToSPIRV/FuncToSPIRVPass.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
-#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
#include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index b28b4900e6814..bbba495e613b2 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -643,22 +643,6 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> {
];
}
-//===----------------------------------------------------------------------===//
-// GPUToAMDGPU
-//===----------------------------------------------------------------------===//
-
-def ConvertGPUToAMDGPUPass : Pass<"convert-gpu-to-amdgpu"> {
- let summary = "Generate AMDGPU operations for gpu operations";
- let dependentDialects = [
- "amdgpu::AMDGPUDialect",
- "LLVM::LLVMDialect",
- "ROCDL::ROCDLDialect",
- ];
- let options = [Option<"subgroupSize", "subgroup-size", "unsigned",
- /*default=*/"64",
- "Size of subgroup">];
-}
-
//===----------------------------------------------------------------------===//
// ConvertIndexToLLVMPass
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index b957a4473f1e6..b6c21440c571c 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -24,7 +24,6 @@ add_subdirectory(FuncToEmitC)
add_subdirectory(FuncToLLVM)
add_subdirectory(FuncToSPIRV)
add_subdirectory(GPUCommon)
-add_subdirectory(GPUToAMDGPU)
add_subdirectory(GPUToLLVMSPV)
add_subdirectory(GPUToNVVM)
add_subdirectory(GPUToROCDL)
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
deleted file mode 100644
index 9b82b5dc63d9c..0000000000000
--- a/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-add_mlir_conversion_library(MLIRGPUToAMDGPU
- GPUToAMDGPU.cpp
-
- ADDITIONAL_HEADER_DIRS
- ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/GPUToAMDGPU
-
- DEPENDS
- MLIRConversionPassIncGen
-
- LINK_COMPONENTS
- Core
-
- LINK_LIBS PUBLIC
- MLIRLLVMCommonConversion
- MLIRLLVMDialect
- MLIRGPUDialect
- MLIRAMDGPUDialect
- MLIRAMDGPUUtils
- MLIRROCDLDialect
- MLIRPass
- MLIRTransforms
- )
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
deleted file mode 100644
index 590fa7d9b4ffc..0000000000000
--- a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-//===- GPUToAMDGPU.cpp - GPU to AMDGPU dialect conversion -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
-
-#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/TypeUtilities.h"
-#include "mlir/Pass/Pass.h"
-
-#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-
-#include "mlir/Transforms/WalkPatternRewriteDriver.h"
-#include "llvm/Support/FormatVariadic.h"
-
-namespace mlir {
-#define GEN_PASS_DEF_CONVERTGPUTOAMDGPUPASS
-#include "mlir/Conversion/Passes.h.inc"
-} // namespace mlir
-
-using namespace mlir;
-
-namespace {
-struct ClusterInfo {
- unsigned clusterStride;
- unsigned clusterSize;
- unsigned subgroupSize;
-};
-
-static FailureOr<ClusterInfo>
-getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) {
- assert(llvm::isPowerOf2_32(subgroupSize));
-
- std::optional<uint32_t> clusterSize = op.getClusterSize();
- assert(!clusterSize ||
- llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
- if (clusterSize && *clusterSize > subgroupSize)
- return op.emitOpError()
- << "cluster size " << *clusterSize
- << " is greater than subgroup size " << subgroupSize;
- unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
-
- auto clusterStride = op.getClusterStride();
- assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
- if (clusterStride >= subgroupSize)
- return op.emitOpError()
- << "cluster stride " << clusterStride
- << " is not less than subgroup size " << subgroupSize;
-
- return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
-}
-
-Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
- gpu::AllReduceOperation mode,
- const ClusterInfo &ci) {
- Value result = input;
- if (ci.clusterSize >= 2) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
- Value dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- if (ci.clusterSize >= 4) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
- Value dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- if (ci.clusterSize >= 8) {
- Value dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
- b.getUnitAttr());
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- if (ci.clusterSize >= 16) {
- Value dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- const int allRows = 0xf;
- const int allBanks = 0xf;
- auto int32Type = IntegerType::get(b.getContext(), 32);
- if (ci.clusterSize >= 32) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
- Value dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
- b.getUnitAttr(), 0xa, allBanks, false);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- if (ci.subgroupSize == 32) {
- Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
- result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
- }
- }
-
- if (ci.clusterSize == 64) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
- Value dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
- b.getUnitAttr(), allRows, allBanks, false);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
- result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
- }
-
- assert(result.getType() == input.getType());
- return result;
-}
-
-struct ScalarSubgroupReduceToShuffles final
- : OpRewritePattern<gpu::SubgroupReduceOp> {
- ScalarSubgroupReduceToShuffles(MLIRContext *ctx, unsigned subgroupSize,
- bool matchClustered, PatternBenefit benefit)
- : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
- matchClustered(matchClustered) {}
-
- LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
- PatternRewriter &rewriter) const override {
- if (op.getClusterSize().has_value() != matchClustered) {
- return rewriter.notifyMatchFailure(
- op, llvm::formatv("op is {0}clustered but pattern is configured to "
- "only match {1}clustered ops",
- matchClustered ? "non-" : "",
- matchClustered ? "" : "non-"));
- }
-
- auto ci = getAndValidateClusterInfo(op, subgroupSize);
- if (failed(ci))
- return failure();
-
- Location loc = op.getLoc();
- rewriter.replaceOp(op, createSubgroupDPPReduction(
- rewriter, loc, op.getValue(), op.getOp(), *ci));
- return success();
- }
-
-private:
- unsigned subgroupSize = 0;
- bool matchClustered = false;
-};
-
-struct ConvertGPUToAMDGPUPass
- : public impl::ConvertGPUToAMDGPUPassBase<ConvertGPUToAMDGPUPass> {
- using Base::Base;
-
- void runOnOperation() override {
- RewritePatternSet patterns(&getContext());
- int subgroupSizeInt = static_cast<int>(subgroupSize);
- populateAMDGPUOptimizedSubgroupReducePatterns(patterns, subgroupSizeInt,
- PatternBenefit(1));
- walkAndApplyPatterns(getOperation(), std::move(patterns));
- }
-};
-} // namespace
-
-void mlir::populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns,
- unsigned subgroupSize,
- PatternBenefit benefit) {
- patterns.add<ScalarSubgroupReduceToShuffles>(
- patterns.getContext(), subgroupSize, /*matchClustered=*/true, benefit);
-}
diff --git a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
index 52484ac69a3e2..945e3ccdfa87b 100644
--- a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
@@ -15,7 +15,6 @@ add_mlir_conversion_library(MLIRGPUToROCDLTransforms
MLIRMathToLLVM
MLIRMathToROCDL
MLIRAMDGPUToROCDL
- MLIRGPUToAMDGPU
MLIRFuncToLLVM
MLIRGPUDialect
MLIRGPUToGPURuntimeTransforms
>From 0d560c219bf11d4b5e6b9eb3eff7680c66a6ba5e Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Thu, 10 Apr 2025 14:06:51 -0500
Subject: [PATCH 05/28] Adding permlanex16 and other dpp related ops to mlir
dialect
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 4 +++-
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 6 ++++++
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 14 +++++++++++++
.../GPU/Transforms/SubgroupReduceLowering.cpp | 21 +++++++++++--------
mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir | 8 +++++++
5 files changed, 43 insertions(+), 10 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 108d7237ff703..17c1162170073 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -524,7 +524,8 @@ def AMDGPU_DPPPerm : I32EnumAttr<"DPPPerm",
I32EnumAttrCase<"row_mirror", 8>,
I32EnumAttrCase<"row_half_mirror", 9>,
I32EnumAttrCase<"row_bcast_15", 10>,
- I32EnumAttrCase<"row_bcast_31", 11>
+ I32EnumAttrCase<"row_bcast_31", 11>,
+ I32EnumAttrCase<"row_share", 12>
]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::amdgpu";
@@ -557,6 +558,7 @@ def AMDGPU_DPPOp : AMDGPU_Op<"dpp", [SameTypeOperands, AllTypesMatch<["result",
- Reverse within a half-row (`row_half_mirror`)
- Broadcast the 15th lane of each row to the next row (`row_bcast`)
- Broadcast lane 31 to rows 2 and 3 (`row_bcast`)
+ - Broadcast a lane with index in [0, 15] within each row to all lanes of that row (`row_share`)
}];
let results = (outs AnyType:$result);
let assemblyFormat = [{
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 5f697bdeef566..4d343c8f3200c 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1293,6 +1293,7 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
ROW_HALF_MIRROR = 0x141,
BCAST15 = 0x142,
BCAST31 = 0x143,
+ ROW_SHARE0 = 0x150
};
auto kind = DppOp.getKind();
@@ -1350,6 +1351,11 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
case DPPPerm::row_bcast_31:
DppCtrl = DppCtrl::BCAST31;
break;
+ case DPPPerm::row_share:
+ if (auto intAttr = cast<IntegerAttr>(*permArgument)) {
+ DppCtrl = intAttr.getInt() + DppCtrl::ROW_SHARE0;
+ }
+ break;
}
// Check for row_mask, bank_mask, bound_ctrl if they exist and create
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 549a4376a4a04..af4438f028542 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -468,6 +468,20 @@ LogicalResult DPPOp::verify() {
}
break;
}
+
+ case DPPPerm::row_share: {
+ if (!permArgument) {
+ return emitOpError("Attribute '" + Twine(stringifyDPPPerm(kind)) +
+ "' value not specified");
+ }
+ if (auto intAttr = dyn_cast<IntegerAttr>(permArgument)) {
+ uint32_t attrValue = intAttr.getInt();
+ if (attrValue < 0 || attrValue > 15) {
+ return emitOpError(
+ "Attribute value for 'row_share' must be between 0 and 15");
+ }
+ }
+ } break;
}
return success();
}
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index f07ef6cf154a9..3e64681ad2dd2 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -370,7 +370,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
const ClusterInfo &ci) {
Value result = input;
if (ci.clusterSize >= 2) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
+ auto permArg = b.getI32IntegerAttr(1);
Value dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
amdgpu::DPPPerm::row_shl, permArg);
@@ -379,7 +379,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
}
if (ci.clusterSize >= 4) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
+ auto permArg = b.getI32IntegerAttr(2);
Value dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
amdgpu::DPPPerm::row_shl, permArg);
@@ -405,16 +405,19 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
const int allRows = 0xf;
const int allBanks = 0xf;
- auto int32Type = IntegerType::get(b.getContext(), 32);
+ auto uint32Type = b.getIntegerType(32, false);
if (ci.clusterSize >= 32) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
- Value dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
- b.getUnitAttr(), 0xa, allBanks, false);
+ // auto permArg = b.getI32IntegerAttr(15);
+ // Value dppResult = b.create<amdgpu::DPPOp>(
+ // loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+ // b.getUnitAttr(), 0xa, allBanks, false);
+ auto uIntMax = llvm::APInt::getMaxValue(32u);
+ Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, uint32Type, uIntMax);
+ Value dppResult = b.create<ROCDL::PermlaneX16Op>(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
if (ci.subgroupSize == 32) {
- Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
+ Value lane01 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 1);
result =
b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
}
@@ -427,7 +430,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
b.getUnitAttr(), allRows, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
- Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+ Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
index 14691e73e62d7..64b3328b70ab4 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
@@ -137,3 +137,11 @@ func.func @row_bcast_update_dpp_f16(%arg0: f16, %arg1: f16) -> f16 {
%0 = amdgpu.dpp %arg0 %arg1 row_bcast_15 { bound_ctrl = true } : f16
return %0 : f16
}
+
+func.func @dpp_row_share(%arg0: i32, %arg1: i32) -> i32 {
+ // CHECK-LABEL: func @dpp_row_share
+ // CHECK: rocdl.update.dpp %arg0, %arg1 with 351, 15, 15, false : i32
+ // CHECK: return %0 : i32
+ %0 = amdgpu.dpp %arg0 %arg1 row_share ( 0xf : i32 ) : i32
+ return %0 : i32
+}
>From 015e9b9353df71200cff96d75f84fa3c583101b1 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Fri, 11 Apr 2025 10:30:10 -0500
Subject: [PATCH 06/28] Fixing permlanex16 intrinsic failure
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 3e64681ad2dd2..b6bd67fa0ce53 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -405,14 +405,9 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
const int allRows = 0xf;
const int allBanks = 0xf;
- auto uint32Type = b.getIntegerType(32, false);
if (ci.clusterSize >= 32) {
- // auto permArg = b.getI32IntegerAttr(15);
- // Value dppResult = b.create<amdgpu::DPPOp>(
- // loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
- // b.getUnitAttr(), 0xa, allBanks, false);
auto uIntMax = llvm::APInt::getMaxValue(32u);
- Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, uint32Type, uIntMax);
+ Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), uIntMax);
Value dppResult = b.create<ROCDL::PermlaneX16Op>(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
>From 945f0e83e96722b8dbecd1317baced566e8b3ff8 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Fri, 11 Apr 2025 11:27:53 -0500
Subject: [PATCH 07/28] simplify verbose typing
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index b6bd67fa0ce53..b9eae59584e94 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -419,7 +419,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
}
if (ci.clusterSize == 64) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
+ auto permArg = b.getI32IntegerAttr(31);
Value dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
b.getUnitAttr(), allRows, allBanks, false);
>From 1b356ed68d3a5f2067736a7ad3dc437fea31a7fc Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Fri, 11 Apr 2025 22:13:11 -0500
Subject: [PATCH 08/28] testing numerics
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../GPU/Transforms/SubgroupReduceLowering.cpp | 59 ++++++++++++-------
1 file changed, 38 insertions(+), 21 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index b9eae59584e94..0790edc15921e 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -369,46 +369,63 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
gpu::AllReduceOperation mode,
const ClusterInfo &ci) {
Value result = input;
+ Value dppResult;
+ const int allRows = 0xf;
+ const int allBanks = 0xf;
+ const bool boundCtrl = true;
if (ci.clusterSize >= 2) {
auto permArg = b.getI32IntegerAttr(1);
- Value dppResult =
+ dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg);
+ amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
if (ci.clusterSize >= 4) {
auto permArg = b.getI32IntegerAttr(2);
- Value dppResult =
+ dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg);
+ amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
- if (ci.clusterSize >= 8) {
- Value dppResult = b.create<amdgpu::DPPOp>(
+ if (ci.clusterSize <= 8) {
+ dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
- b.getUnitAttr());
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
+ b.getUnitAttr(), allRows, allBanks, boundCtrl);
+ } else if (ci.clusterSize == 8) {
+ auto permArg = b.getI32IntegerAttr(4);
+ dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
}
-
- if (ci.clusterSize >= 16) {
- Value dppResult =
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+
+ if (ci.clusterSize <= 16) {
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror,
+ b.getUnitAttr(), allRows, allBanks, boundCtrl);
+ } else if (ci.clusterSize == 16) {
+ auto permArg = b.getI32IntegerAttr(8);
+ dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
+ amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
}
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
- const int allRows = 0xf;
- const int allBanks = 0xf;
if (ci.clusterSize >= 32) {
- auto uIntMax = llvm::APInt::getMaxValue(32u);
- Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), uIntMax);
- Value dppResult = b.create<ROCDL::PermlaneX16Op>(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false);
+ auto permArg = b.getI32IntegerAttr(15);
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+ b.getUnitAttr(), 0xa, allBanks, false);
+ // if (chipset.majorVersion == 9)
+ // auto uIntMax = llvm::APInt::getMaxValue(32u);
+ // Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), uIntMax);
+ // Value dppResult = b.create<ROCDL::PermlaneX16Op>(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
if (ci.subgroupSize == 32) {
@@ -420,7 +437,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
if (ci.clusterSize == 64) {
auto permArg = b.getI32IntegerAttr(31);
- Value dppResult = b.create<amdgpu::DPPOp>(
+ dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
b.getUnitAttr(), allRows, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
>From 7fd30c051c743dde370616db6d0942f9dfac03d2 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Fri, 11 Apr 2025 22:44:39 -0500
Subject: [PATCH 09/28] fixing
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../GPU/Transforms/SubgroupReduceLowering.cpp | 32 +++++++++++--------
1 file changed, 18 insertions(+), 14 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 0790edc15921e..b47553e41c501 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -391,31 +391,35 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result, dppResult);
}
- if (ci.clusterSize <= 8) {
+ if (ci.clusterSize == 8) {
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
b.getUnitAttr(), allRows, allBanks, boundCtrl);
- } else if (ci.clusterSize == 8) {
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ } else if (ci.clusterSize >= 8) {
auto permArg = b.getI32IntegerAttr(4);
- dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
+ dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shl, permArg,
+ allRows, allBanks, boundCtrl);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
}
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- if (ci.clusterSize <= 16) {
+ if (ci.clusterSize == 16) {
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror,
b.getUnitAttr(), allRows, allBanks, boundCtrl);
- } else if (ci.clusterSize == 16) {
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ } else if (ci.clusterSize >= 16) {
auto permArg = b.getI32IntegerAttr(8);
- dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
- }
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shl, permArg,
+ allRows, allBanks, boundCtrl);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
+ }
if (ci.clusterSize >= 32) {
auto permArg = b.getI32IntegerAttr(15);
>From 0c28b4d08ff6158908c498f32341faaddb6e4909 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Fri, 11 Apr 2025 23:16:43 -0500
Subject: [PATCH 10/28] fixing
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index b47553e41c501..889c378ab0a9f 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -420,7 +420,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
-
+ Value lane01 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 1);
if (ci.clusterSize >= 32) {
auto permArg = b.getI32IntegerAttr(15);
dppResult = b.create<amdgpu::DPPOp>(
@@ -433,7 +433,6 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
if (ci.subgroupSize == 32) {
- Value lane01 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 1);
result =
b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
}
@@ -446,8 +445,8 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
b.getUnitAttr(), allRows, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
- Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
- result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
+ // Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
+ result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
}
assert(result.getType() == input.getType());
>From bfda71216024b6c14166f9eb988a07518a66548b Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Fri, 11 Apr 2025 23:32:43 -0500
Subject: [PATCH 11/28] fixing
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 889c378ab0a9f..d774197dc6d15 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -420,7 +420,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
- Value lane01 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 1);
+ Value lane00 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 0);
if (ci.clusterSize >= 32) {
auto permArg = b.getI32IntegerAttr(15);
dppResult = b.create<amdgpu::DPPOp>(
@@ -434,7 +434,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result, dppResult);
if (ci.subgroupSize == 32) {
result =
- b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
+ b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane00);
}
}
@@ -446,7 +446,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
// Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
- result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
+ result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane00);
}
assert(result.getType() == input.getType());
>From 54c08ef52b4ba507c9314fb2e89f00fd77d2fa85 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Sun, 13 Apr 2025 22:17:55 -0500
Subject: [PATCH 12/28] trying again
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../GPU/Transforms/SubgroupReduceLowering.cpp | 67 ++++++++++++-------
1 file changed, 42 insertions(+), 25 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index d774197dc6d15..8dd637b28d4ae 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -374,53 +374,71 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
const int allBanks = 0xf;
const bool boundCtrl = true;
if (ci.clusterSize >= 2) {
- auto permArg = b.getI32IntegerAttr(1);
+ // auto permArg = b.getI32IntegerAttr(1);
+ auto permArg = b.getI32ArrayAttr({1, 0, 3, 2});
dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
+ amdgpu::DPPPerm::quad_perm, permArg, allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
if (ci.clusterSize >= 4) {
- auto permArg = b.getI32IntegerAttr(2);
+ // auto permArg = b.getI32IntegerAttr(2);
+ auto permArg = b.getI32ArrayAttr({2, 3, 0, 1});
dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
+ amdgpu::DPPPerm::quad_perm, permArg, allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
- if (ci.clusterSize == 8) {
+ // if (ci.clusterSize == 8) {
+ // dppResult = b.create<amdgpu::DPPOp>(
+ // loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+ // b.getUnitAttr(), allRows, allBanks, boundCtrl);
+ // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ // result, dppResult);
+ // } else if (ci.clusterSize >= 8) {
+ // auto permArg = b.getI32IntegerAttr(4);
+ // dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ // amdgpu::DPPPerm::row_shr, permArg,
+ // allRows, allBanks, boundCtrl);
+ // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ // result, dppResult);
+ // }
+ if (ci.clusterSize >= 8) {
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
b.getUnitAttr(), allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
- } else if (ci.clusterSize >= 8) {
- auto permArg = b.getI32IntegerAttr(4);
- dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg,
- allRows, allBanks, boundCtrl);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
}
- if (ci.clusterSize == 16) {
+ // if (ci.clusterSize == 16) {
+ // dppResult = b.create<amdgpu::DPPOp>(
+ // loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror,
+ // b.getUnitAttr(), allRows, allBanks, boundCtrl);
+ // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ // result, dppResult);
+ // } else if (ci.clusterSize >= 16) {
+ // auto permArg = b.getI32IntegerAttr(8);
+ // dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ // amdgpu::DPPPerm::row_shr, permArg,
+ // allRows, allBanks, boundCtrl);
+ // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ // result, dppResult);
+ // }
+ if (ci.clusterSize >= 16) {
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror,
b.getUnitAttr(), allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
- } else if (ci.clusterSize >= 16) {
- auto permArg = b.getI32IntegerAttr(8);
- dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg,
- allRows, allBanks, boundCtrl);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
}
- Value lane00 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 0);
+
+ Value lane31 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 31);
+ Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
if (ci.clusterSize >= 32) {
auto permArg = b.getI32IntegerAttr(15);
dppResult = b.create<amdgpu::DPPOp>(
@@ -434,7 +452,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result, dppResult);
if (ci.subgroupSize == 32) {
result =
- b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane00);
+ b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane31);
}
}
@@ -442,11 +460,10 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
auto permArg = b.getI32IntegerAttr(31);
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
- b.getUnitAttr(), allRows, allBanks, false);
+ b.getUnitAttr(), 0xc, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
- // Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
- result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane00);
+ result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
}
assert(result.getType() == input.getType());
>From 6535bda2437e630ec19953985d59564d05fea336 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Mon, 14 Apr 2025 00:59:52 -0500
Subject: [PATCH 13/28] Fixing implementation
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../GPU/Transforms/SubgroupReduceLowering.cpp | 72 +++++++------------
1 file changed, 26 insertions(+), 46 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 8dd637b28d4ae..0c923828093b9 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -10,6 +10,7 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
@@ -367,14 +368,14 @@ struct VectorSubgroupReduceToShuffles final
Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
gpu::AllReduceOperation mode,
- const ClusterInfo &ci) {
+ const ClusterInfo &ci,
+ amdgpu::Chipset chipset) {
Value result = input;
Value dppResult;
const int allRows = 0xf;
const int allBanks = 0xf;
const bool boundCtrl = true;
if (ci.clusterSize >= 2) {
- // auto permArg = b.getI32IntegerAttr(1);
auto permArg = b.getI32ArrayAttr({1, 0, 3, 2});
dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
@@ -384,7 +385,6 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
}
if (ci.clusterSize >= 4) {
- // auto permArg = b.getI32IntegerAttr(2);
auto permArg = b.getI32ArrayAttr({2, 3, 0, 1});
dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
@@ -393,20 +393,6 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result, dppResult);
}
- // if (ci.clusterSize == 8) {
- // dppResult = b.create<amdgpu::DPPOp>(
- // loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
- // b.getUnitAttr(), allRows, allBanks, boundCtrl);
- // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- // result, dppResult);
- // } else if (ci.clusterSize >= 8) {
- // auto permArg = b.getI32IntegerAttr(4);
- // dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- // amdgpu::DPPPerm::row_shr, permArg,
- // allRows, allBanks, boundCtrl);
- // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- // result, dppResult);
- // }
if (ci.clusterSize >= 8) {
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
@@ -415,20 +401,6 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result, dppResult);
}
- // if (ci.clusterSize == 16) {
- // dppResult = b.create<amdgpu::DPPOp>(
- // loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror,
- // b.getUnitAttr(), allRows, allBanks, boundCtrl);
- // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- // result, dppResult);
- // } else if (ci.clusterSize >= 16) {
- // auto permArg = b.getI32IntegerAttr(8);
- // dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- // amdgpu::DPPPerm::row_shr, permArg,
- // allRows, allBanks, boundCtrl);
- // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- // result, dppResult);
- // }
if (ci.clusterSize >= 16) {
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror,
@@ -440,14 +412,19 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
Value lane31 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 31);
Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
if (ci.clusterSize >= 32) {
- auto permArg = b.getI32IntegerAttr(15);
- dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
- b.getUnitAttr(), 0xa, allBanks, false);
- // if (chipset.majorVersion == 9)
- // auto uIntMax = llvm::APInt::getMaxValue(32u);
- // Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), uIntMax);
- // Value dppResult = b.create<ROCDL::PermlaneX16Op>(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false);
+ if (chipset.majorVersion <= 9) {
+ auto permArg = b.getI32IntegerAttr(15);
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+ b.getUnitAttr(), 0xa, allBanks, false);
+ } else if (chipset.majorVersion == 10) {
+ auto uIntMax = llvm::APInt::getMaxValue(32u);
+ Value uIntMaxConst =
+ b.create<LLVM::ConstantOp>(loc, b.getI32Type(), uIntMax);
+ Value dppResult = b.create<ROCDL::PermlaneX16Op>(
+ loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst,
+ true, false);
+ }
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
if (ci.subgroupSize == 32) {
@@ -458,9 +435,9 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
if (ci.clusterSize == 64) {
auto permArg = b.getI32IntegerAttr(31);
- dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
- b.getUnitAttr(), 0xc, allBanks, false);
+ dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_bcast_31,
+ b.getUnitAttr(), 0xc, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
@@ -473,9 +450,10 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
struct ScalarSubgroupReduceToDPP final
: OpRewritePattern<gpu::SubgroupReduceOp> {
ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
- bool matchClustered, PatternBenefit benefit)
+ bool matchClustered, Chipset chipset,
+ PatternBenefit benefit)
: OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
- matchClustered(matchClustered) {}
+ matchClustered(matchClustered), chipset(chipset) {}
LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
PatternRewriter &rewriter) const override {
@@ -498,6 +476,7 @@ struct ScalarSubgroupReduceToDPP final
private:
unsigned subgroupSize = 0;
bool matchClustered = false;
+ Chipset chipset;
};
} // namespace
@@ -510,10 +489,11 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns(
}
void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
- RewritePatternSet &patterns, unsigned subgroupSize,
+ RewritePatternSet &patterns, unsigned subgroupSize, Chipset chipset,
PatternBenefit benefit) {
patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
- /*matchClustered=*/true, benefit);
+ /*matchClustered=*/true, chipset,
+ benefit);
}
void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
>From 85e3b6271bf7568685bc57051a420b9ef02bc5bf Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Mon, 14 Apr 2025 16:11:33 -0500
Subject: [PATCH 14/28] Adding DPP test
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../mlir/Dialect/GPU/Transforms/Passes.h | 2 +
.../GPU/Transforms/SubgroupReduceLowering.cpp | 57 +++++++++++--------
.../Dialect/GPU/subgroup-reduce-lowering.mlir | 33 +++++++++++
mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 24 ++++++--
4 files changed, 89 insertions(+), 27 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index 41e0759e958b5..5b185e262deb0 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -13,6 +13,7 @@
#ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
#define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/IR/PatternMatch.h"
@@ -67,6 +68,7 @@ void populateGpuLowerSubgroupReduceToShufflePatterns(
/// `subgroupSize` lanes. Applicable only to AMD GPUs.
void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns,
unsigned subgroupSize,
+ amdgpu::Chipset chipset,
PatternBenefit benefit = 1);
/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns`
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 0c923828093b9..a327730851ed4 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -26,6 +26,7 @@
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
+#include <llvm-14/llvm/Support/ErrorHandling.h>
using namespace mlir;
@@ -370,25 +371,27 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
gpu::AllReduceOperation mode,
const ClusterInfo &ci,
amdgpu::Chipset chipset) {
- Value result = input;
Value dppResult;
+ Value result = input;
const int allRows = 0xf;
const int allBanks = 0xf;
const bool boundCtrl = true;
+ Value lane31 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 31);
+ Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
if (ci.clusterSize >= 2) {
auto permArg = b.getI32ArrayAttr({1, 0, 3, 2});
- dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::quad_perm, permArg, allRows, allBanks, boundCtrl);
+ dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::quad_perm, permArg,
+ allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
if (ci.clusterSize >= 4) {
auto permArg = b.getI32ArrayAttr({2, 3, 0, 1});
- dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::quad_perm, permArg, allRows, allBanks, boundCtrl);
+ dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::quad_perm, permArg,
+ allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
@@ -409,19 +412,15 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result, dppResult);
}
- Value lane31 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 31);
- Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
if (ci.clusterSize >= 32) {
if (chipset.majorVersion <= 9) {
- auto permArg = b.getI32IntegerAttr(15);
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
- b.getUnitAttr(), 0xa, allBanks, false);
+ b.getUnitAttr(), 0xa, allBanks, /*bound_ctrl*/ false);
} else if (chipset.majorVersion == 10) {
- auto uIntMax = llvm::APInt::getMaxValue(32u);
Value uIntMaxConst =
- b.create<LLVM::ConstantOp>(loc, b.getI32Type(), uIntMax);
- Value dppResult = b.create<ROCDL::PermlaneX16Op>(
+ b.create<LLVM::ConstantOp>(loc, b.getI32Type(), -1);
+ dppResult = b.create<ROCDL::PermlaneX16Op>(
loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst,
true, false);
}
@@ -434,10 +433,9 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
}
if (ci.clusterSize == 64) {
- auto permArg = b.getI32IntegerAttr(31);
- dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_bcast_31,
- b.getUnitAttr(), 0xc, allBanks, false);
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
+ b.getUnitAttr(), 0xc, allBanks, /*bound_ctrl*/ false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
@@ -447,10 +445,13 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
return result;
}
+/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp`
+/// ops over scalar types. Assumes that the subgroup has
+/// `subgroupSize` lanes. Applicable only to AMD GPUs.
struct ScalarSubgroupReduceToDPP final
: OpRewritePattern<gpu::SubgroupReduceOp> {
ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
- bool matchClustered, Chipset chipset,
+ bool matchClustered, amdgpu::Chipset chipset,
PatternBenefit benefit)
: OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
matchClustered(matchClustered), chipset(chipset) {}
@@ -467,16 +468,26 @@ struct ScalarSubgroupReduceToDPP final
auto ci = getAndValidateClusterInfo(op, subgroupSize);
if (failed(ci))
return failure();
+
+ if (ci->clusterStride != 1)
+ return failure();
+
+ Type valueTy = op.getType();
+ if (!valueTy.isIntOrFloat())
+ return rewriter.notifyMatchFailure(
+ op, "value type is not a compatible scalar");
+
Location loc = op.getLoc();
- rewriter.replaceOp(op, createSubgroupDPPReduction(
- rewriter, loc, op.getValue(), op.getOp(), *ci));
+ rewriter.replaceOp(op,
+ createSubgroupDPPReduction(rewriter, loc, op.getValue(),
+ op.getOp(), *ci, chipset));
return success();
}
private:
unsigned subgroupSize = 0;
bool matchClustered = false;
- Chipset chipset;
+ amdgpu::Chipset chipset;
};
} // namespace
@@ -489,7 +500,7 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns(
}
void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
- RewritePatternSet &patterns, unsigned subgroupSize, Chipset chipset,
+ RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
PatternBenefit benefit) {
patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
/*matchClustered=*/true, chipset,
diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
index 9f2aa1be52fc3..8ac1a5561aad6 100644
--- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
+++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
@@ -6,14 +6,20 @@
// RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles" %s \
// RUN: | FileCheck %s --check-prefix=CHECK-SHFL
+// RUN: mlir-opt --allow-unregistered-dialect \
+// RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx942" %s \
+// RUN: | FileCheck %s --check-prefix=CHECK-DPP
+
// CHECK-SUB: gpu.module @kernels {
// CHECK-SHFL: gpu.module @kernels {
+// CHECK-DPP: gpu.module @kernels {
gpu.module @kernels {
// CHECK-SUB-LABEL: gpu.func @kernel0(
// CHECK-SUB-SAME: %[[ARG0:.+]]: vector<5xf16>)
//
// CHECK-SHFL-LABEL: gpu.func @kernel0(
+ // CHECK-DPP-LABEL: gpu.func @kernel0(
gpu.func @kernel0(%arg0: vector<5xf16>) kernel {
// CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16>
// CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16>
@@ -36,6 +42,7 @@ gpu.module @kernels {
// CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4)
// CHECK-SUB: "test.consume"
+ // CHECK-DPP-COUNT-2: amdgpu.dpp {{.+}}
%sum2 = gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>)
"test.consume"(%sum2) : (vector<5xf16>) -> ()
@@ -52,6 +59,8 @@ gpu.module @kernels {
// CHECK-SUB-SAME: %[[ARG0:.+]]: vector<1xf32>)
//
// CHECK-SHFL-LABEL: gpu.func @kernel1(
+ //
+ // CHECK-DPP-LABEL: gpu.func @kernel1(
gpu.func @kernel1(%arg0: vector<1xf32>) kernel {
// CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32>
// CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32
@@ -68,6 +77,8 @@ gpu.module @kernels {
// Note stride is dropped because it is == 1.
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32
// CHECK-SUB: "test.consume"
+ // CHECK-DPP-COUNT-2: amdgpu.dpp {{.+}} quad_perm
+ // CHECK-DPP: amdgpu.dpp {{.+}} row_half_mirror
%sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>)
"test.consume"(%sum2) : (vector<1xf32>) -> ()
@@ -131,6 +142,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel3_clustered(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
+ // CHECK-DPP-LABEL: gpu.func @kernel3_clustered(
+ // CHECK-DPP-SAME: %[[ARG0:.+]]: i32)
gpu.func @kernel3_clustered(%arg0: i32) kernel {
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -144,6 +157,14 @@ gpu.module @kernels {
// CHECK-SHFL: %[[S2:.+]], %{{.+}} = gpu.shuffle xor %[[A1]], %[[C4]], %[[C32]] : i32
// CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32
// CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> ()
+
+ // CHECK-DPP: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
+ // CHECK-DPP: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
+ // CHECK-DPP: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
+ // CHECK-DPP: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
+ // CHECK-DPP: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
+ // CHECK-DPP: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
+ // CHECK-DPP: "test.consume"(%[[A2]]) : (i32) -> ()
%sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32
"test.consume"(%sum0) : (i32) -> ()
@@ -246,6 +267,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel5_clustered(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
+ // CHECK-DPP-LABEL: gpu.func @kernel5_clustered
+ // CHECK-DPP-SAME: %[[ARG0:.+]]: i16)
gpu.func @kernel5_clustered(%arg0: i16) kernel {
// CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
// CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -257,6 +280,16 @@ gpu.module @kernels {
// CHECK-SHFL: arith.trunci {{.+}} : i32 to i16
// CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
// CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
+
+ // CHECK-DPPL: %[[VAR0:.+]] =amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
+ // CHECK-DPPL: %[[VAR1:.+]] =arith.addi %[[ARG0]], %[[VAR0]] : i16
+ // CHECK-DPPL: %[[VAR2:.+]] =amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
+ // CHECK-DPPL: %[[VAR3:.+]] =arith.addi %[[VAR1]], %[[VAR2]] : i16
+ // CHECK-DPPL: %[[VAR4:.+]] =amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
+ // CHECK-DPPL: %[[VAR5:.+]] =arith.addi %[[VAR3]], %[[VAR4]] : i16
+ // CHECK-DPPL: %[[VAR6:.+]] =amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
+ // CHECK-DPPL: %[[VAR7:.+]] =arith.addi %[[VAR5]], %[[VAR6]] : i16
+ // CHECK-DPPL: "test.consume"(%[[VAR7]]) : (i16) -> ()
%sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16
"test.consume"(%sum0) : (i16) -> ()
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index a49d304baf5c6..7515e9050240d 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -10,10 +10,13 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/PatternMatch.h"
@@ -28,8 +31,9 @@ struct TestGpuRewritePass
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestGpuRewritePass)
void getDependentDialects(DialectRegistry ®istry) const override {
- registry.insert<arith::ArithDialect, func::FuncDialect, index::IndexDialect,
- memref::MemRefDialect>();
+ registry.insert<amdgpu::AMDGPUDialect, arith::ArithDialect,
+ func::FuncDialect, index::IndexDialect,
+ memref::MemRefDialect, ROCDL::ROCDLDialect>();
}
StringRef getArgument() const final { return "test-gpu-rewrite"; }
StringRef getDescription() const final {
@@ -54,7 +58,8 @@ struct TestGpuSubgroupReduceLoweringPass
: PassWrapper(pass) {}
void getDependentDialects(DialectRegistry ®istry) const override {
- registry.insert<arith::ArithDialect, vector::VectorDialect>();
+ registry.insert<amdgpu::AMDGPUDialect, arith::ArithDialect, LLVM::LLVMDialect,
+ ROCDL::ROCDLDialect, vector::VectorDialect>();
}
StringRef getArgument() const final {
@@ -70,6 +75,12 @@ struct TestGpuSubgroupReduceLoweringPass
llvm::cl::desc("Expand subgroup_reduce ops to shuffle ops."),
llvm::cl::init(false)};
+ Option<std::string> target{
+ *this, "target",
+ llvm::cl::desc("Target backend name which will be used to provide "
+ "compatible lowerings of subgroup reduce."),
+ llvm::cl::init("")};
+
void runOnOperation() override {
RewritePatternSet patterns(&getContext());
@@ -77,8 +88,13 @@ struct TestGpuSubgroupReduceLoweringPass
// perform fewer failing matches.
populateGpuBreakDownSubgroupReducePatterns(patterns,
/*maxShuffleBitwidth=*/32,
- PatternBenefit(2));
+ PatternBenefit(3));
if (expandToShuffles) {
+ auto maybeChipset = amdgpu::Chipset::parse(target);
+ if (!failed(maybeChipset)) {
+ populateGpuLowerSubgroupReduceToDPPPatterns(
+ patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2));
+ }
populateGpuLowerSubgroupReduceToShufflePatterns(
patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32);
populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
>From 3392f082d573676ce2c8b87fe726b285875e4ea0 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Mon, 14 Apr 2025 16:28:48 -0500
Subject: [PATCH 15/28] Addressing PR comments
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../GPU/Transforms/SubgroupReduceLowering.cpp | 12 +++++++-----
mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 2 +-
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index a327730851ed4..a01b182501f36 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -376,8 +376,10 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
const int allRows = 0xf;
const int allBanks = 0xf;
const bool boundCtrl = true;
- Value lane31 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 31);
- Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
+ Value lane31 =
+ b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(31));
+ Value lane63 =
+ b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(63));
if (ci.clusterSize >= 2) {
auto permArg = b.getI32ArrayAttr({1, 0, 3, 2});
dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
@@ -417,9 +419,9 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
b.getUnitAttr(), 0xa, allBanks, /*bound_ctrl*/ false);
- } else if (chipset.majorVersion == 10) {
- Value uIntMaxConst =
- b.create<LLVM::ConstantOp>(loc, b.getI32Type(), -1);
+ } else if (chipset.majorVersion >= 10) {
+ Value uIntMaxConst = b.create<arith::ConstantOp>(loc, b.getI32Type(),
+ b.getI32IntegerAttr(-1));
dppResult = b.create<ROCDL::PermlaneX16Op>(
loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst,
true, false);
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index 7515e9050240d..97f9e33290f35 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -91,7 +91,7 @@ struct TestGpuSubgroupReduceLoweringPass
PatternBenefit(3));
if (expandToShuffles) {
auto maybeChipset = amdgpu::Chipset::parse(target);
- if (!failed(maybeChipset)) {
+ if (succeeded(maybeChipset)) {
populateGpuLowerSubgroupReduceToDPPPatterns(
patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2));
}
>From b59922ab89a2e20948ff07cbf743571045c134c9 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Mon, 14 Apr 2025 17:01:12 -0500
Subject: [PATCH 16/28] removing unnecessary header
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index a01b182501f36..b0803ff050391 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -26,7 +26,6 @@
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
-#include <llvm-14/llvm/Support/ErrorHandling.h>
using namespace mlir;
>From 6431293b3bb3741bd2e461acebb264a5ed29f81b Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 00:27:29 -0500
Subject: [PATCH 17/28] Addressing PR comments
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../mlir/Dialect/GPU/Transforms/Passes.h | 4 +++
.../GPU/Transforms/SubgroupReduceLowering.cpp | 14 ++++++--
.../Dialect/GPU/subgroup-reduce-lowering.mlir | 34 ++++++++++++++-----
mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 2 ++
4 files changed, 42 insertions(+), 12 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index 5b185e262deb0..f113649e0c908 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -71,6 +71,10 @@ void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns,
amdgpu::Chipset chipset,
PatternBenefit benefit = 1);
+void populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
+ RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
+ PatternBenefit benefit = 1);
+
/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns`
/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index b0803ff050391..0a0dc95b0c0d9 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -372,8 +372,8 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
amdgpu::Chipset chipset) {
Value dppResult;
Value result = input;
- const int allRows = 0xf;
- const int allBanks = 0xf;
+ constexpr int allRows = 0xf;
+ constexpr int allBanks = 0xf;
const bool boundCtrl = true;
Value lane31 =
b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(31));
@@ -504,10 +504,18 @@ void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
PatternBenefit benefit) {
patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
- /*matchClustered=*/true, chipset,
+ /*matchClustered=*/false, chipset,
benefit);
}
+void mlir::populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
+ RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
+ PatternBenefit benefit) {
+patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
+ /*matchClustered=*/true, chipset,
+ benefit);
+}
+
void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
RewritePatternSet &patterns, unsigned subgroupSize,
unsigned shuffleBitwidth, PatternBenefit benefit) {
diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
index 8ac1a5561aad6..018ea835ea38c 100644
--- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
+++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
@@ -97,6 +97,8 @@ gpu.module @kernels {
// CHECK-SUB-SAME: %[[ARG0:.+]]: vector<3xi8>, %[[ARG1:.+]]: vector<4xi8>)
//
// CHECK-SHFL-LABEL: gpu.func @kernel2(
+ // CHECK-DPP-LABEL: gpu.func @kernel2(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel {
// CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8>
// CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> ()
@@ -114,6 +116,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel3(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
+ // CHECK-DPP-LABEL: gpu.func @kernel3(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel3(%arg0: i32) kernel {
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -174,6 +178,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
+ // CHECK-DPP-LABEL: gpu.func @kernel3_clustered_strided(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel3_clustered_strided(%arg0: i32) kernel {
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32
@@ -196,6 +202,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel4(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>)
+ // CHECK-DPP-LABEL: gpu.func @kernel4(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel4(%arg0: vector<2xf16>) kernel {
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -232,6 +240,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel4_clustered(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>)
+ // CHECK-DPP-LABEL: gpu.func @kernel4_clustered(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel4_clustered(%arg0: vector<2xf16>) kernel {
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -247,6 +257,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel5(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
+ // CHECK-DPP-LABEL: gpu.func @kernel5(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel5(%arg0: i16) kernel {
// CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
// CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -281,15 +293,15 @@ gpu.module @kernels {
// CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
// CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
- // CHECK-DPPL: %[[VAR0:.+]] =amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
- // CHECK-DPPL: %[[VAR1:.+]] =arith.addi %[[ARG0]], %[[VAR0]] : i16
- // CHECK-DPPL: %[[VAR2:.+]] =amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
- // CHECK-DPPL: %[[VAR3:.+]] =arith.addi %[[VAR1]], %[[VAR2]] : i16
- // CHECK-DPPL: %[[VAR4:.+]] =amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
- // CHECK-DPPL: %[[VAR5:.+]] =arith.addi %[[VAR3]], %[[VAR4]] : i16
- // CHECK-DPPL: %[[VAR6:.+]] =amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
- // CHECK-DPPL: %[[VAR7:.+]] =arith.addi %[[VAR5]], %[[VAR6]] : i16
- // CHECK-DPPL: "test.consume"(%[[VAR7]]) : (i16) -> ()
+ // CHECK-DPP: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
+ // CHECK-DPP: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
+ // CHECK-DPP: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
+ // CHECK-DPP: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
+ // CHECK-DPP: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
+ // CHECK-DPP: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
+ // CHECK-DPP: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
+ // CHECK-DPP: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
+ // CHECK-DPP: "test.consume"(%[[VAR7]]) : (i16) -> ()
%sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16
"test.consume"(%sum0) : (i16) -> ()
@@ -299,6 +311,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel6(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>)
+ // CHECK-DPP-LABEL: gpu.func @kernel6(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel6(%arg0: vector<3xi8>) kernel {
// CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8>
// CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8>
@@ -322,6 +336,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>)
+ // CHECK-DPP-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel_cluster_size_is_subgroup_size(%arg0: vector<3xi8>) kernel {
// CHECK-SHFL-COUNT-5: gpu.shuffle xor
%sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (vector<3xi8>) -> (vector<3xi8>)
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index 97f9e33290f35..f34b882c1be86 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -94,6 +94,8 @@ struct TestGpuSubgroupReduceLoweringPass
if (succeeded(maybeChipset)) {
populateGpuLowerSubgroupReduceToDPPPatterns(
patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2));
+ populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
+ patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2));
}
populateGpuLowerSubgroupReduceToShufflePatterns(
patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32);
>From ae25fa0a55e931f1450120c1d5d298f3f0d27d49 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 01:06:47 -0500
Subject: [PATCH 18/28] moving permlanex16 changes to another commit
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 4 +---
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 6 ------
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 14 -------------
.../GPU/Transforms/SubgroupReduceLowering.cpp | 21 ++++++++++---------
mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir | 8 -------
5 files changed, 12 insertions(+), 41 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 17c1162170073..108d7237ff703 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -524,8 +524,7 @@ def AMDGPU_DPPPerm : I32EnumAttr<"DPPPerm",
I32EnumAttrCase<"row_mirror", 8>,
I32EnumAttrCase<"row_half_mirror", 9>,
I32EnumAttrCase<"row_bcast_15", 10>,
- I32EnumAttrCase<"row_bcast_31", 11>,
- I32EnumAttrCase<"row_share", 12>
+ I32EnumAttrCase<"row_bcast_31", 11>
]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::amdgpu";
@@ -558,7 +557,6 @@ def AMDGPU_DPPOp : AMDGPU_Op<"dpp", [SameTypeOperands, AllTypesMatch<["result",
- Reverse within a half-row (`row_half_mirror`)
- Broadcast the 15th lane of each row to the next row (`row_bcast`)
- Broadcast lane 31 to rows 2 and 3 (`row_bcast`)
- - Broadcast a lane [0-15] within row 0 to all lanes of row 0 (`row_share`)
}];
let results = (outs AnyType:$result);
let assemblyFormat = [{
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 4d343c8f3200c..5f697bdeef566 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1293,7 +1293,6 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
ROW_HALF_MIRROR = 0x141,
BCAST15 = 0x142,
BCAST31 = 0x143,
- ROW_SHARE0 = 0x150
};
auto kind = DppOp.getKind();
@@ -1351,11 +1350,6 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
case DPPPerm::row_bcast_31:
DppCtrl = DppCtrl::BCAST31;
break;
- case DPPPerm::row_share:
- if (auto intAttr = cast<IntegerAttr>(*permArgument)) {
- DppCtrl = intAttr.getInt() + DppCtrl::ROW_SHARE0;
- }
- break;
}
// Check for row_mask, bank_mask, bound_ctrl if they exist and create
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index af4438f028542..549a4376a4a04 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -468,20 +468,6 @@ LogicalResult DPPOp::verify() {
}
break;
}
-
- case DPPPerm::row_share: {
- if (!permArgument) {
- return emitOpError("Attribute '" + Twine(stringifyDPPPerm(kind)) +
- "' value not specified");
- }
- if (auto intAttr = dyn_cast<IntegerAttr>(permArgument)) {
- uint32_t attrValue = intAttr.getInt();
- if (attrValue < 0 || attrValue > 15) {
- return emitOpError(
- "Attribute value for 'row_share' must be between 0 and 15");
- }
- }
- } break;
}
return success();
}
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 0a0dc95b0c0d9..77201f319164f 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -366,7 +366,7 @@ struct VectorSubgroupReduceToShuffles final
bool matchClustered = false;
};
-Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
+std::optional<Value> createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
gpu::AllReduceOperation mode,
const ClusterInfo &ci,
amdgpu::Chipset chipset) {
@@ -418,12 +418,8 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
b.getUnitAttr(), 0xa, allBanks, /*bound_ctrl*/ false);
- } else if (chipset.majorVersion >= 10) {
- Value uIntMaxConst = b.create<arith::ConstantOp>(loc, b.getI32Type(),
- b.getI32IntegerAttr(-1));
- dppResult = b.create<ROCDL::PermlaneX16Op>(
- loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst,
- true, false);
+ } else {
+ return std::nullopt;
}
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
@@ -479,9 +475,14 @@ struct ScalarSubgroupReduceToDPP final
op, "value type is not a compatible scalar");
Location loc = op.getLoc();
- rewriter.replaceOp(op,
- createSubgroupDPPReduction(rewriter, loc, op.getValue(),
- op.getOp(), *ci, chipset));
+ std::optional<Value> dpp = createSubgroupDPPReduction(
+ rewriter, loc, op.getValue(), op.getOp(), *ci, chipset);
+ if (!dpp)
+ return rewriter.notifyMatchFailure(
+ op, "Subgroup reduce lowering to DPP not currently supported for "
+ "this device.");
+
+ rewriter.replaceOp(op, *dpp);
return success();
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
index 64b3328b70ab4..14691e73e62d7 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
@@ -137,11 +137,3 @@ func.func @row_bcast_update_dpp_f16(%arg0: f16, %arg1: f16) -> f16 {
%0 = amdgpu.dpp %arg0 %arg1 row_bcast_15 { bound_ctrl = true } : f16
return %0 : f16
}
-
-func.func @dpp_row_share(%arg0: i32, %arg1: i32) -> i32 {
- // CHECK-LABEL: func @dpp_row_share
- // CHECK: rocdl.update.dpp %arg0, %arg1 with 351, 15, 15, false : i32
- // CHECK: return %0 : i32
- %0 = amdgpu.dpp %arg0 %arg1 row_share ( 0xf : i32 ) : i32
- return %0 : i32
-}
>From 97450983127a0ce7ca43d4e120fe84215225ebbd Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 02:27:29 -0500
Subject: [PATCH 19/28] fixing test
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../Dialect/GPU/subgroup-reduce-lowering.mlir | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
index 018ea835ea38c..11db35e31588b 100644
--- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
+++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
@@ -32,11 +32,15 @@ gpu.module @kernels {
// CHECK-SUB: %[[R2:.+]] = gpu.subgroup_reduce add %[[E2]] : (f16) -> f16
// CHECK-SUB: %[[V2:.+]] = vector.insert %[[R2]], %[[V1]] [4] : f16 into vector<5xf16>
// CHECK-SUB: "test.consume"(%[[V2]]) : (vector<5xf16>) -> ()
+ // CHECK-DPP-COUNT-6: amdgpu.dpp
+ // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (vector<5xf16>) -> (vector<5xf16>)
"test.consume"(%sum0) : (vector<5xf16>) -> ()
// CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform
// CHECK-SUB: "test.consume"
+ // CHECK-DPP-COUNT-6: amdgpu.dpp
+ // CHECK-DPP: rocdl.readlane
%sum1 = gpu.subgroup_reduce mul %arg0 uniform : (vector<5xf16>) -> (vector<5xf16>)
"test.consume"(%sum1) : (vector<5xf16>) -> ()
@@ -66,11 +70,15 @@ gpu.module @kernels {
// CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32
// CHECK-SUB: %[[V0:.+]] = vector.broadcast %[[R0]] : f32 to vector<1xf32>
// CHECK-SUB: "test.consume"(%[[V0]]) : (vector<1xf32>) -> ()
+ // CHECK-DPP-COUNT-6: amdgpu.dpp
+ // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (vector<1xf32>) -> (vector<1xf32>)
"test.consume"(%sum0) : (vector<1xf32>) -> ()
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform : (f32) -> f32
// CHECK-SUB: "test.consume"
+ // CHECK-DPP-COUNT-6: amdgpu.dpp
+ // CHECK-DPP: rocdl.readlane
%sum1 = gpu.subgroup_reduce add %arg0 uniform : (vector<1xf32>) -> (vector<1xf32>)
"test.consume"(%sum1) : (vector<1xf32>) -> ()
@@ -84,6 +92,7 @@ gpu.module @kernels {
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32
// CHECK-SUB: "test.consume"
+ // CHECK-DPP-NOT: amdgpu.dpp
%sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>)
"test.consume"(%sum3) : (vector<1xf32>) -> ()
@@ -137,6 +146,9 @@ gpu.module @kernels {
// CHECK-SHFL: %[[S4:.+]], %{{.+}} = gpu.shuffle xor %[[A3]], %[[C16]], %[[C32]] : i32
// CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32
// CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> ()
+
+ // CHECK-DPP-COUNT-6: amdgpu.dpp
+ // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (i32) -> i32
"test.consume"(%sum0) : (i32) -> ()
@@ -258,7 +270,6 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel5(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
// CHECK-DPP-LABEL: gpu.func @kernel5(
- // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel5(%arg0: i16) kernel {
// CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
// CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -270,6 +281,8 @@ gpu.module @kernels {
// CHECK-SHFL: arith.trunci {{.+}} : i32 to i16
// CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
// CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
+ // CHECK-DPP-COUNT-6: amdgpu.dpp
+ // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (i16) -> i16
"test.consume"(%sum0) : (i16) -> ()
>From a6c35b3a88cc22eb5f01447cdd69f5b1c017fd4a Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 10:38:42 -0500
Subject: [PATCH 20/28] fixing code formatting
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../mlir/Dialect/GPU/Transforms/Passes.h | 14 ++++++-----
.../GPU/Transforms/SubgroupReduceLowering.cpp | 23 ++++++++++---------
mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 10 ++++----
3 files changed, 25 insertions(+), 22 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index f113649e0c908..a13ad33df29cd 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -63,6 +63,12 @@ void populateGpuLowerSubgroupReduceToShufflePatterns(
RewritePatternSet &patterns, unsigned subgroupSize,
unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);
+/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns`
+/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
+void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
+ RewritePatternSet &patterns, unsigned subgroupSize,
+ unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);
+
/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp`
/// ops over scalar types. Assumes that the subgroup has
/// `subgroupSize` lanes. Applicable only to AMD GPUs.
@@ -71,16 +77,12 @@ void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns,
amdgpu::Chipset chipset,
PatternBenefit benefit = 1);
+/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToDPPPatterns`
+/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
void populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
PatternBenefit benefit = 1);
-/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns`
-/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
-void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
- RewritePatternSet &patterns, unsigned subgroupSize,
- unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);
-
/// Collect all patterns to rewrite ops within the GPU dialect.
inline void populateGpuRewritePatterns(RewritePatternSet &patterns) {
populateGpuAllReducePatterns(patterns);
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 77201f319164f..55176f5b10959 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -10,13 +10,13 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
-#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Location.h"
@@ -366,10 +366,11 @@ struct VectorSubgroupReduceToShuffles final
bool matchClustered = false;
};
-std::optional<Value> createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
- gpu::AllReduceOperation mode,
- const ClusterInfo &ci,
- amdgpu::Chipset chipset) {
+std::optional<Value> createSubgroupDPPReduction(OpBuilder &b, Location loc,
+ Value input,
+ gpu::AllReduceOperation mode,
+ const ClusterInfo &ci,
+ amdgpu::Chipset chipset) {
Value dppResult;
Value result = input;
constexpr int allRows = 0xf;
@@ -510,11 +511,11 @@ void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
}
void mlir::populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
- RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
- PatternBenefit benefit) {
-patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
- /*matchClustered=*/true, chipset,
- benefit);
+ RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
+ PatternBenefit benefit) {
+ patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
+ /*matchClustered=*/true, chipset,
+ benefit);
}
void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index f34b882c1be86..fe402da4cc105 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -31,9 +31,8 @@ struct TestGpuRewritePass
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestGpuRewritePass)
void getDependentDialects(DialectRegistry ®istry) const override {
- registry.insert<amdgpu::AMDGPUDialect, arith::ArithDialect,
- func::FuncDialect, index::IndexDialect,
- memref::MemRefDialect, ROCDL::ROCDLDialect>();
+ registry.insert<arith::ArithDialect, func::FuncDialect, index::IndexDialect,
+ memref::MemRefDialect>();
}
StringRef getArgument() const final { return "test-gpu-rewrite"; }
StringRef getDescription() const final {
@@ -58,8 +57,9 @@ struct TestGpuSubgroupReduceLoweringPass
: PassWrapper(pass) {}
void getDependentDialects(DialectRegistry ®istry) const override {
- registry.insert<amdgpu::AMDGPUDialect, arith::ArithDialect, LLVM::LLVMDialect,
- ROCDL::ROCDLDialect, vector::VectorDialect>();
+ registry
+ .insert<amdgpu::AMDGPUDialect, arith::ArithDialect, LLVM::LLVMDialect,
+ ROCDL::ROCDLDialect, vector::VectorDialect>();
}
StringRef getArgument() const final {
>From 8a9cefb8bf364a961639e9b209d9a78f658a9d26 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 15:28:57 -0500
Subject: [PATCH 21/28] Updating implementation to support gfx 10+
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../GPU/Transforms/SubgroupReduceLowering.cpp | 148 +++++++++++-------
1 file changed, 90 insertions(+), 58 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 55176f5b10959..c1dedd9216a14 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -22,6 +22,7 @@
#include "mlir/IR/Location.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
@@ -371,72 +372,103 @@ std::optional<Value> createSubgroupDPPReduction(OpBuilder &b, Location loc,
gpu::AllReduceOperation mode,
const ClusterInfo &ci,
amdgpu::Chipset chipset) {
- Value dppResult;
Value result = input;
constexpr int allRows = 0xf;
constexpr int allBanks = 0xf;
const bool boundCtrl = true;
- Value lane31 =
- b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(31));
- Value lane63 =
- b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(63));
- if (ci.clusterSize >= 2) {
- auto permArg = b.getI32ArrayAttr({1, 0, 3, 2});
- dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::quad_perm, permArg,
- allRows, allBanks, boundCtrl);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- if (ci.clusterSize >= 4) {
- auto permArg = b.getI32ArrayAttr({2, 3, 0, 1});
- dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::quad_perm, permArg,
- allRows, allBanks, boundCtrl);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- if (ci.clusterSize >= 8) {
- dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
- b.getUnitAttr(), allRows, allBanks, boundCtrl);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- if (ci.clusterSize >= 16) {
- dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror,
- b.getUnitAttr(), allRows, allBanks, boundCtrl);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- if (ci.clusterSize >= 32) {
- if (chipset.majorVersion <= 9) {
+ Value lane0 =
+ b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(0));
+ Value lane32 =
+ b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(32));
+
+ auto dppReduceAcrossLanes = [&](int numLanes,
+ Value res) -> std::optional<Value> {
+ Value dppResult, laneVal;
+
+ switch (numLanes) {
+ case 2:
+ // Perform reduction between all lanes N <-> N+1.
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+ b.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
+ break;
+ case 4:
+ // Perform reduction between all lanes N <-> N+2.
dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
- b.getUnitAttr(), 0xa, allBanks, /*bound_ctrl*/ false);
- } else {
+ loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+ b.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
+ break;
+ case 8:
+ // Perform reduction between all lanes N <-> 7-N,
+ // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
+ b.getUnitAttr(), allRows, allBanks, boundCtrl);
+ break;
+ case 16:
+ // Perform reduction between all lanes N <-> 15-N,
+ // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), res, res, amdgpu::DPPPerm::row_mirror,
+ b.getUnitAttr(), allRows, allBanks, boundCtrl);
+ break;
+ case 32:
+ if (chipset.majorVersion <= 9) {
+ // Broadcast last value from each row to next row.
+ // Use row mask to avoid polluting rows 1 and 3.
+ dppResult = b.create<amdgpu::DPPOp>(loc, res.getType(), res, res,
+ amdgpu::DPPPerm::row_bcast_15,
+ b.getUnitAttr(), 0xa, allBanks,
+ /*bound_ctrl*/ false);
+ } else if (chipset.majorVersion <= 12) {
+ // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
+ dppResult = b.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
+ -1, -1, /*fi=*/true,
+ /*bound_ctrl=*/false);
+ if (ci.subgroupSize == 32) {
+ dppResult =
+ b.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+ }
+ } else {
+ return std::nullopt;
+ }
+ break;
+ case 64:
+ if (chipset.majorVersion <= 9) {
+ // Broadcast 31st lane value to rows 2 and 3.
+ // Use row mask to avoid polluting rows 0 and 1.
+ dppResult = b.create<amdgpu::DPPOp>(loc, res.getType(), res, res,
+ amdgpu::DPPPerm::row_bcast_31,
+ b.getUnitAttr(), 0xc, allBanks,
+ /*bound_ctrl*/ false);
+ } else if (chipset.majorVersion <= 12) {
+ // Assume reduction across 32 lanes has been done.
+ // Perform final reduction manually by summing values in lane 0 and
+ // lane 32.
+ dppResult =
+ b.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane32);
+ laneVal = b.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+ return vector::makeArithReduction(
+ b, loc, gpu::convertReductionKind(mode), dppResult, laneVal);
+ } else {
+ return std::nullopt;
+ }
+ break;
+ default:
+ // Should never reach here given previous validation of ClusterInfo.
+ llvm_unreachable("ERROR: Unexpected cluster size.");
return std::nullopt;
}
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- if (ci.subgroupSize == 32) {
- result =
- b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane31);
+ return vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ res, dppResult);
+ };
+
+ for (unsigned cs = 2; cs <= ci.clusterSize; cs = cs << 1) {
+ if (auto dpp = dppReduceAcrossLanes(cs, result)) {
+ result = *dpp;
+ continue;
}
- }
-
- if (ci.clusterSize == 64) {
- dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
- b.getUnitAttr(), 0xc, allBanks, /*bound_ctrl*/ false);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
+ return std::nullopt;
}
assert(result.getType() == input.getType());
>From c39520333c5ee1b779c324c51805718fbb8de963 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 15:54:08 -0500
Subject: [PATCH 22/28] Small formatting change
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index c1dedd9216a14..28c569ee2bd83 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -463,7 +463,7 @@ std::optional<Value> createSubgroupDPPReduction(OpBuilder &b, Location loc,
res, dppResult);
};
- for (unsigned cs = 2; cs <= ci.clusterSize; cs = cs << 1) {
+ for (unsigned cs = 2; cs <= ci.clusterSize; cs <<= 1) {
if (auto dpp = dppReduceAcrossLanes(cs, result)) {
result = *dpp;
continue;
>From ab15c44eea3e0d4fff1cb133a059e459b62229a7 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 16:50:03 -0500
Subject: [PATCH 23/28] Removing ReadlaneOps from test
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir | 6 ------
1 file changed, 6 deletions(-)
diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
index 11db35e31588b..139edf6882df6 100644
--- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
+++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
@@ -33,14 +33,12 @@ gpu.module @kernels {
// CHECK-SUB: %[[V2:.+]] = vector.insert %[[R2]], %[[V1]] [4] : f16 into vector<5xf16>
// CHECK-SUB: "test.consume"(%[[V2]]) : (vector<5xf16>) -> ()
// CHECK-DPP-COUNT-6: amdgpu.dpp
- // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (vector<5xf16>) -> (vector<5xf16>)
"test.consume"(%sum0) : (vector<5xf16>) -> ()
// CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform
// CHECK-SUB: "test.consume"
// CHECK-DPP-COUNT-6: amdgpu.dpp
- // CHECK-DPP: rocdl.readlane
%sum1 = gpu.subgroup_reduce mul %arg0 uniform : (vector<5xf16>) -> (vector<5xf16>)
"test.consume"(%sum1) : (vector<5xf16>) -> ()
@@ -71,14 +69,12 @@ gpu.module @kernels {
// CHECK-SUB: %[[V0:.+]] = vector.broadcast %[[R0]] : f32 to vector<1xf32>
// CHECK-SUB: "test.consume"(%[[V0]]) : (vector<1xf32>) -> ()
// CHECK-DPP-COUNT-6: amdgpu.dpp
- // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (vector<1xf32>) -> (vector<1xf32>)
"test.consume"(%sum0) : (vector<1xf32>) -> ()
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform : (f32) -> f32
// CHECK-SUB: "test.consume"
// CHECK-DPP-COUNT-6: amdgpu.dpp
- // CHECK-DPP: rocdl.readlane
%sum1 = gpu.subgroup_reduce add %arg0 uniform : (vector<1xf32>) -> (vector<1xf32>)
"test.consume"(%sum1) : (vector<1xf32>) -> ()
@@ -148,7 +144,6 @@ gpu.module @kernels {
// CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> ()
// CHECK-DPP-COUNT-6: amdgpu.dpp
- // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (i32) -> i32
"test.consume"(%sum0) : (i32) -> ()
@@ -282,7 +277,6 @@ gpu.module @kernels {
// CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
// CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
// CHECK-DPP-COUNT-6: amdgpu.dpp
- // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (i16) -> i16
"test.consume"(%sum0) : (i16) -> ()
>From 55f442e488acc1bec1b2ecdd50564a884be87738 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 16:51:09 -0500
Subject: [PATCH 24/28] Improve dpp implementation
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../GPU/Transforms/SubgroupReduceLowering.cpp | 215 +++++++++---------
1 file changed, 107 insertions(+), 108 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 28c569ee2bd83..2200754f55938 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -367,112 +367,112 @@ struct VectorSubgroupReduceToShuffles final
bool matchClustered = false;
};
-std::optional<Value> createSubgroupDPPReduction(OpBuilder &b, Location loc,
- Value input,
- gpu::AllReduceOperation mode,
- const ClusterInfo &ci,
- amdgpu::Chipset chipset) {
- Value result = input;
+FailureOr<Value>
+createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp &op,
+ Value input, gpu::AllReduceOperation mode,
+ const ClusterInfo &ci, amdgpu::Chipset chipset) {
+ Location loc = op.getLoc();
+ Value dpp;
+ Value res = input;
constexpr int allRows = 0xf;
constexpr int allBanks = 0xf;
const bool boundCtrl = true;
- Value lane0 =
- b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(0));
- Value lane32 =
- b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(32));
-
- auto dppReduceAcrossLanes = [&](int numLanes,
- Value res) -> std::optional<Value> {
- Value dppResult, laneVal;
-
- switch (numLanes) {
- case 2:
- // Perform reduction between all lanes N <-> N+1.
- dppResult = b.create<amdgpu::DPPOp>(
- loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
- b.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
- break;
- case 4:
- // Perform reduction between all lanes N <-> N+2.
- dppResult = b.create<amdgpu::DPPOp>(
- loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
- b.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
- break;
- case 8:
- // Perform reduction between all lanes N <-> 7-N,
- // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
- dppResult = b.create<amdgpu::DPPOp>(
- loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
- b.getUnitAttr(), allRows, allBanks, boundCtrl);
- break;
- case 16:
- // Perform reduction between all lanes N <-> 15-N,
- // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
- dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), res, res, amdgpu::DPPPerm::row_mirror,
- b.getUnitAttr(), allRows, allBanks, boundCtrl);
- break;
- case 32:
- if (chipset.majorVersion <= 9) {
- // Broadcast last value from each row to next row.
- // Use row mask to avoid polluting rows 1 and 3.
- dppResult = b.create<amdgpu::DPPOp>(loc, res.getType(), res, res,
- amdgpu::DPPPerm::row_bcast_15,
- b.getUnitAttr(), 0xa, allBanks,
- /*bound_ctrl*/ false);
- } else if (chipset.majorVersion <= 12) {
- // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
- dppResult = b.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
- -1, -1, /*fi=*/true,
- /*bound_ctrl=*/false);
- if (ci.subgroupSize == 32) {
- dppResult =
- b.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
- }
- } else {
- return std::nullopt;
- }
- break;
- case 64:
- if (chipset.majorVersion <= 9) {
- // Broadcast 31st lane value to rows 2 and 3.
- // Use row mask to avoid polluting rows 0 and 1.
- dppResult = b.create<amdgpu::DPPOp>(loc, res.getType(), res, res,
- amdgpu::DPPPerm::row_bcast_31,
- b.getUnitAttr(), 0xc, allBanks,
- /*bound_ctrl*/ false);
- } else if (chipset.majorVersion <= 12) {
- // Assume reduction across 32 lanes has been done.
- // Perform final reduction manually by summing values in lane 0 and
- // lane 32.
- dppResult =
- b.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane32);
- laneVal = b.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
- return vector::makeArithReduction(
- b, loc, gpu::convertReductionKind(mode), dppResult, laneVal);
- } else {
- return std::nullopt;
+ if (ci.clusterSize >= 2) {
+ // Perform reduction between all lanes N <-> N+1.
+ dpp = rewriter.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+ rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
+ res = vector::makeArithReduction(rewriter, loc,
+ gpu::convertReductionKind(mode), res, dpp);
+ }
+
+ if (ci.clusterSize >= 4) {
+ // Perform reduction between all lanes N <-> N+2.
+ dpp = rewriter.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+ rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
+ res = vector::makeArithReduction(rewriter, loc,
+ gpu::convertReductionKind(mode), res, dpp);
+ }
+ if (ci.clusterSize >= 8) {
+ // Perform reduction between all lanes N <-> 7-N,
+ // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
+ dpp = rewriter.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
+ rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
+ res = vector::makeArithReduction(rewriter, loc,
+ gpu::convertReductionKind(mode), res, dpp);
+ }
+ if (ci.clusterSize >= 16) {
+ // Perform reduction between all lanes N <-> 15-N,
+ // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
+ dpp = rewriter.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror,
+ rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
+ res = vector::makeArithReduction(rewriter, loc,
+ gpu::convertReductionKind(mode), res, dpp);
+ }
+ if (ci.clusterSize >= 32) {
+ if (chipset.majorVersion <= 9) {
+ // Broadcast last value from each row to next row.
+ // Use row mask to avoid polluting rows 1 and 3.
+ dpp = rewriter.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15,
+ rewriter.getUnitAttr(), 0xa, allBanks,
+ /*bound_ctrl*/ false);
+ res = vector::makeArithReduction(
+ rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+ } else if (chipset.majorVersion <= 12) {
+ // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
+ Value uint32Max = rewriter.create<arith::ConstantOp>(
+ loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
+ dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
+ uint32Max, uint32Max,
+ /*fi=*/true,
+ /*bound_ctrl=*/false);
+ res = vector::makeArithReduction(
+ rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+ if (ci.subgroupSize == 32) {
+ Value lane0 = rewriter.create<arith::ConstantOp>(
+ loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+ dpp =
+ rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
}
- break;
- default:
- // Should never reach here given previous validation of ClusterInfo.
- llvm_unreachable("ERROR: Unexpected cluster size.");
- return std::nullopt;
+ } else {
+ return rewriter.notifyMatchFailure(
+ op, "Subgroup reduce lowering to DPP not currently supported for "
+ "this device.");
}
- return vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- res, dppResult);
- };
-
- for (unsigned cs = 2; cs <= ci.clusterSize; cs <<= 1) {
- if (auto dpp = dppReduceAcrossLanes(cs, result)) {
- result = *dpp;
- continue;
+ }
+ if (ci.clusterSize >= 64) {
+ if (chipset.majorVersion <= 9) {
+ // Broadcast 31st lane value to rows 2 and 3.
+ // Use row mask to avoid polluting rows 0 and 1.
+ dpp = rewriter.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_31,
+ rewriter.getUnitAttr(), 0xc, allBanks,
+ /*bound_ctrl*/ false);
+
+ } else if (chipset.majorVersion <= 12) {
+ // Assume reduction across 32 lanes has been done.
+ // Perform final reduction manually by summing values in lane 0 and
+ // lane 32.
+ Value lane0 = rewriter.create<arith::ConstantOp>(
+ loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+ Value lane32 = rewriter.create<arith::ConstantOp>(
+ loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32));
+ dpp = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane32);
+ res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+ } else {
+ return rewriter.notifyMatchFailure(
+ op, "Subgroup reduce lowering to DPP not currently supported for "
+ "this device.");
}
- return std::nullopt;
+ res = vector::makeArithReduction(rewriter, loc,
+ gpu::convertReductionKind(mode), res, dpp);
}
-
- assert(result.getType() == input.getType());
- return result;
+ assert(res.getType() == input.getType());
+ return res;
}
/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp`
@@ -500,22 +500,21 @@ struct ScalarSubgroupReduceToDPP final
return failure();
if (ci->clusterStride != 1)
- return failure();
+ return rewriter.notifyMatchFailure(
+ op, "Supgroup reductions using DPP are currently only available for "
+ "clusters of contiguous lanes.");
Type valueTy = op.getType();
if (!valueTy.isIntOrFloat())
return rewriter.notifyMatchFailure(
op, "value type is not a compatible scalar");
- Location loc = op.getLoc();
- std::optional<Value> dpp = createSubgroupDPPReduction(
- rewriter, loc, op.getValue(), op.getOp(), *ci, chipset);
- if (!dpp)
- return rewriter.notifyMatchFailure(
- op, "Subgroup reduce lowering to DPP not currently supported for "
- "this device.");
+ FailureOr<Value> dpp = createSubgroupDPPReduction(
+ rewriter, op, op.getValue(), op.getOp(), *ci, chipset);
+ if (failed(dpp))
+ return failure();
- rewriter.replaceOp(op, *dpp);
+ rewriter.replaceOp(op, dpp.value());
return success();
}
>From 644228894d572de1dc4790a0f614e827c5ca8f9a Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 22:07:58 -0500
Subject: [PATCH 25/28] fixing formatting
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 2200754f55938..56fae105c4d45 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -425,7 +425,7 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp &op,
} else if (chipset.majorVersion <= 12) {
// Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
Value uint32Max = rewriter.create<arith::ConstantOp>(
- loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
+ loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
uint32Max, uint32Max,
/*fi=*/true,
@@ -440,8 +440,8 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp &op,
}
} else {
return rewriter.notifyMatchFailure(
- op, "Subgroup reduce lowering to DPP not currently supported for "
- "this device.");
+ op, "Subgroup reduce lowering to DPP not currently supported for "
+ "this device.");
}
}
if (ci.clusterSize >= 64) {
@@ -465,8 +465,8 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp &op,
res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
} else {
return rewriter.notifyMatchFailure(
- op, "Subgroup reduce lowering to DPP not currently supported for "
- "this device.");
+ op, "Subgroup reduce lowering to DPP not currently supported for "
+ "this device.");
}
res = vector::makeArithReduction(rewriter, loc,
gpu::convertReductionKind(mode), res, dpp);
>From 848c6baa2aad527a98317182e83e96bc06eb9b88 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Tue, 22 Apr 2025 10:40:47 -0500
Subject: [PATCH 26/28] Fixing implementation of DPP subgroup reduce
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 56fae105c4d45..f2fc9a4e39bcd 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -368,7 +368,7 @@ struct VectorSubgroupReduceToShuffles final
};
FailureOr<Value>
-createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp &op,
+createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
Value input, gpu::AllReduceOperation mode,
const ClusterInfo &ci, amdgpu::Chipset chipset) {
Location loc = op.getLoc();
@@ -435,7 +435,7 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp &op,
if (ci.subgroupSize == 32) {
Value lane0 = rewriter.create<arith::ConstantOp>(
loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
- dpp =
+ res =
rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
}
} else {
>From ad77f7a3c5238784dd8913fcf55360d61134279e Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Tue, 22 Apr 2025 22:50:06 -0500
Subject: [PATCH 27/28] [mlir][AMDGPU] Improving DPP implementation of
 subgroup reduce
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../mlir/Dialect/GPU/Transforms/Passes.h | 4 +-
.../include/mlir/Dialect/GPU/Utils/GPUUtils.h | 2 +
.../mlir/Dialect/GPU/Utils/ReductionUtils.h | 41 +++
mlir/lib/Dialect/GPU/CMakeLists.txt | 1 +
.../GPU/Transforms/SubgroupReduceLowering.cpp | 332 ++++++++++--------
mlir/lib/Dialect/GPU/Utils/CMakeLists.txt | 5 +-
mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp | 170 +++++++++
mlir/lib/Dialect/GPU/Utils/Utils.cpp | 26 ++
mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 4 +-
9 files changed, 431 insertions(+), 154 deletions(-)
create mode 100644 mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
create mode 100644 mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index a13ad33df29cd..5c63ad5f32b71 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -74,13 +74,15 @@ void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
/// `subgroupSize` lanes. Applicable only to AMD GPUs.
void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns,
unsigned subgroupSize,
+ unsigned shuffleBitwidth,
amdgpu::Chipset chipset,
PatternBenefit benefit = 1);
/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToDPPPatterns`
/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
void populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
- RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
+ RewritePatternSet &patterns, unsigned subgroupSize,
+ unsigned shuffleBitwidth, amdgpu::Chipset chipset,
PatternBenefit benefit = 1);
/// Collect all patterns to rewrite ops within the GPU dialect.
diff --git a/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h
index 073493971e6b7..a55f0e1f09a36 100644
--- a/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h
+++ b/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h
@@ -29,6 +29,8 @@ class LaunchOp;
/// Returns the matching vector combining kind.
vector::CombiningKind convertReductionKind(gpu::AllReduceOperation mode);
+/// Returns the matching gpu allreduce mode.
+gpu::AllReduceOperation convertReductionMode(vector::CombiningKind kind);
} // namespace gpu
/// Get a gpu.func created from outlining the region of a gpu.launch op with the
diff --git a/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
new file mode 100644
index 0000000000000..f766dab8c02df
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
@@ -0,0 +1,41 @@
+//===- ReductionUtils.h - Reduction Utilities -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_TRANSFORMS_REDUCTIONUTILS_H_
+#define MLIR_DIALECT_GPU_TRANSFORMS_REDUCTIONUTILS_H_
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Value.h"
+
+namespace mlir {
+
+struct ClusterInfo {
+ unsigned clusterStride;
+ unsigned clusterSize;
+ unsigned subgroupSize;
+};
+
+FailureOr<ClusterInfo> getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
+ unsigned subgroupSize);
+
+FailureOr<Value>
+createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
+ Value input, gpu::AllReduceOperation mode,
+ const ClusterInfo &ci, amdgpu::Chipset chipset,
+ function_ref<Value(Value)> packFn,
+ function_ref<Value(Value)> unpackFn);
+
+} // namespace mlir
+
+#endif // MLIR_DIALECT_GPU_TRANSFORMS_REDUCTIONUTILS_H_
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 013311ec027da..1074760aa959e 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -53,6 +53,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
LINK_LIBS PUBLIC
MLIRAffineUtils
+ MLIRAMDGPUDialect
MLIRArithDialect
MLIRAsyncDialect
MLIRBufferizationDialect
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index f2fc9a4e39bcd..57af63cbe5eca 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -16,6 +16,7 @@
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
+#include "mlir/Dialect/GPU/Utils/ReductionUtils.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinTypes.h"
@@ -148,34 +149,34 @@ struct ScalarizeSingleElementReduce final
}
};
-struct ClusterInfo {
- unsigned clusterStride;
- unsigned clusterSize;
- unsigned subgroupSize;
-};
-
-static FailureOr<ClusterInfo>
-getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) {
- assert(llvm::isPowerOf2_32(subgroupSize));
-
- std::optional<uint32_t> clusterSize = op.getClusterSize();
- assert(!clusterSize ||
- llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
- if (clusterSize && *clusterSize > subgroupSize)
- return op.emitOpError()
- << "cluster size " << *clusterSize
- << " is greater than subgroup size " << subgroupSize;
- unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
-
- auto clusterStride = op.getClusterStride();
- assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
- if (clusterStride >= subgroupSize)
- return op.emitOpError()
- << "cluster stride " << clusterStride
- << " is not less than subgroup size " << subgroupSize;
-
- return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
-}
+// struct ClusterInfo {
+// unsigned clusterStride;
+// unsigned clusterSize;
+// unsigned subgroupSize;
+// };
+
+// static FailureOr<ClusterInfo>
+// getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) {
+// assert(llvm::isPowerOf2_32(subgroupSize));
+
+// std::optional<uint32_t> clusterSize = op.getClusterSize();
+// assert(!clusterSize ||
+// llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
+// if (clusterSize && *clusterSize > subgroupSize)
+// return op.emitOpError()
+// << "cluster size " << *clusterSize
+// << " is greater than subgroup size " << subgroupSize;
+// unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
+
+// auto clusterStride = op.getClusterStride();
+// assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
+// if (clusterStride >= subgroupSize)
+// return op.emitOpError()
+// << "cluster stride " << clusterStride
+// << " is not less than subgroup size " << subgroupSize;
+
+// return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
+// }
/// Emits a subgroup reduction using a sequence of shuffles. Uses the `packFn`
/// and `unpackFn` to convert to the native shuffle type and to the reduction
@@ -367,113 +368,113 @@ struct VectorSubgroupReduceToShuffles final
bool matchClustered = false;
};
-FailureOr<Value>
-createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
- Value input, gpu::AllReduceOperation mode,
- const ClusterInfo &ci, amdgpu::Chipset chipset) {
- Location loc = op.getLoc();
- Value dpp;
- Value res = input;
- constexpr int allRows = 0xf;
- constexpr int allBanks = 0xf;
- const bool boundCtrl = true;
- if (ci.clusterSize >= 2) {
- // Perform reduction between all lanes N <-> N+1.
- dpp = rewriter.create<amdgpu::DPPOp>(
- loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
- rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
- res = vector::makeArithReduction(rewriter, loc,
- gpu::convertReductionKind(mode), res, dpp);
- }
-
- if (ci.clusterSize >= 4) {
- // Perform reduction between all lanes N <-> N+2.
- dpp = rewriter.create<amdgpu::DPPOp>(
- loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
- rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
- res = vector::makeArithReduction(rewriter, loc,
- gpu::convertReductionKind(mode), res, dpp);
- }
- if (ci.clusterSize >= 8) {
- // Perform reduction between all lanes N <-> 7-N,
- // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
- dpp = rewriter.create<amdgpu::DPPOp>(
- loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
- rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
- res = vector::makeArithReduction(rewriter, loc,
- gpu::convertReductionKind(mode), res, dpp);
- }
- if (ci.clusterSize >= 16) {
- // Perform reduction between all lanes N <-> 15-N,
- // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
- dpp = rewriter.create<amdgpu::DPPOp>(
- loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror,
- rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
- res = vector::makeArithReduction(rewriter, loc,
- gpu::convertReductionKind(mode), res, dpp);
- }
- if (ci.clusterSize >= 32) {
- if (chipset.majorVersion <= 9) {
- // Broadcast last value from each row to next row.
- // Use row mask to avoid polluting rows 1 and 3.
- dpp = rewriter.create<amdgpu::DPPOp>(
- loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15,
- rewriter.getUnitAttr(), 0xa, allBanks,
- /*bound_ctrl*/ false);
- res = vector::makeArithReduction(
- rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
- } else if (chipset.majorVersion <= 12) {
- // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
- Value uint32Max = rewriter.create<arith::ConstantOp>(
- loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
- dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
- uint32Max, uint32Max,
- /*fi=*/true,
- /*bound_ctrl=*/false);
- res = vector::makeArithReduction(
- rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
- if (ci.subgroupSize == 32) {
- Value lane0 = rewriter.create<arith::ConstantOp>(
- loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
- res =
- rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
- }
- } else {
- return rewriter.notifyMatchFailure(
- op, "Subgroup reduce lowering to DPP not currently supported for "
- "this device.");
- }
- }
- if (ci.clusterSize >= 64) {
- if (chipset.majorVersion <= 9) {
- // Broadcast 31st lane value to rows 2 and 3.
- // Use row mask to avoid polluting rows 0 and 1.
- dpp = rewriter.create<amdgpu::DPPOp>(
- loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_31,
- rewriter.getUnitAttr(), 0xc, allBanks,
- /*bound_ctrl*/ false);
-
- } else if (chipset.majorVersion <= 12) {
- // Assume reduction across 32 lanes has been done.
- // Perform final reduction manually by summing values in lane 0 and
- // lane 32.
- Value lane0 = rewriter.create<arith::ConstantOp>(
- loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
- Value lane32 = rewriter.create<arith::ConstantOp>(
- loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32));
- dpp = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane32);
- res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
- } else {
- return rewriter.notifyMatchFailure(
- op, "Subgroup reduce lowering to DPP not currently supported for "
- "this device.");
- }
- res = vector::makeArithReduction(rewriter, loc,
- gpu::convertReductionKind(mode), res, dpp);
- }
- assert(res.getType() == input.getType());
- return res;
-}
+// FailureOr<Value>
+// createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
+// Value input, gpu::AllReduceOperation mode,
+// const ClusterInfo &ci, amdgpu::Chipset chipset) {
+// Location loc = op.getLoc();
+// Value dpp;
+// Value res = input;
+// constexpr int allRows = 0xf;
+// constexpr int allBanks = 0xf;
+// const bool boundCtrl = true;
+// if (ci.clusterSize >= 2) {
+// // Perform reduction between all lanes N <-> N+1.
+// dpp = rewriter.create<amdgpu::DPPOp>(
+// loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+// rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
+// res = vector::makeArithReduction(rewriter, loc,
+// gpu::convertReductionKind(mode), res, dpp);
+// }
+
+// if (ci.clusterSize >= 4) {
+// // Perform reduction between all lanes N <-> N+2.
+// dpp = rewriter.create<amdgpu::DPPOp>(
+// loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+// rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
+// res = vector::makeArithReduction(rewriter, loc,
+// gpu::convertReductionKind(mode), res, dpp);
+// }
+// if (ci.clusterSize >= 8) {
+// // Perform reduction between all lanes N <-> 7-N,
+// // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
+// dpp = rewriter.create<amdgpu::DPPOp>(
+// loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
+// rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
+// res = vector::makeArithReduction(rewriter, loc,
+// gpu::convertReductionKind(mode), res, dpp);
+// }
+// if (ci.clusterSize >= 16) {
+// // Perform reduction between all lanes N <-> 15-N,
+// // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
+// dpp = rewriter.create<amdgpu::DPPOp>(
+// loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror,
+// rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
+// res = vector::makeArithReduction(rewriter, loc,
+// gpu::convertReductionKind(mode), res, dpp);
+// }
+// if (ci.clusterSize >= 32) {
+// if (chipset.majorVersion <= 9) {
+// // Broadcast last value from each row to next row.
+// // Use row mask to avoid polluting rows 1 and 3.
+// dpp = rewriter.create<amdgpu::DPPOp>(
+// loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15,
+// rewriter.getUnitAttr(), 0xa, allBanks,
+// /*bound_ctrl*/ false);
+// res = vector::makeArithReduction(
+// rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+// } else if (chipset.majorVersion <= 12) {
+// // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
+// Value uint32Max = rewriter.create<arith::ConstantOp>(
+// loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
+// dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
+// uint32Max, uint32Max,
+// /*fi=*/true,
+// /*bound_ctrl=*/false);
+// res = vector::makeArithReduction(
+// rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+// if (ci.subgroupSize == 32) {
+// Value lane0 = rewriter.create<arith::ConstantOp>(
+// loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+// res =
+// rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+// }
+// } else {
+// return rewriter.notifyMatchFailure(
+// op, "Subgroup reduce lowering to DPP not currently supported for "
+// "this device.");
+// }
+// }
+// if (ci.clusterSize >= 64) {
+// if (chipset.majorVersion <= 9) {
+// // Broadcast 31st lane value to rows 2 and 3.
+// // Use row mask to avoid polluting rows 0 and 1.
+// dpp = rewriter.create<amdgpu::DPPOp>(
+// loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_31,
+// rewriter.getUnitAttr(), 0xc, allBanks,
+// /*bound_ctrl*/ false);
+
+// } else if (chipset.majorVersion <= 12) {
+// // Assume reduction across 32 lanes has been done.
+// // Perform final reduction manually by summing values in lane 0 and
+// // lane 32.
+// Value lane0 = rewriter.create<arith::ConstantOp>(
+// loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+// Value lane32 = rewriter.create<arith::ConstantOp>(
+// loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32));
+// dpp = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane32);
+// res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+// } else {
+// return rewriter.notifyMatchFailure(
+// op, "Subgroup reduce lowering to DPP not currently supported for "
+// "this device.");
+// }
+// res = vector::makeArithReduction(rewriter, loc,
+// gpu::convertReductionKind(mode), res, dpp);
+// }
+// assert(res.getType() == input.getType());
+// return res;
+// }
/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp`
/// ops over scalar types. Assumes that the subgroup has
@@ -481,9 +482,9 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
struct ScalarSubgroupReduceToDPP final
: OpRewritePattern<gpu::SubgroupReduceOp> {
ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
- bool matchClustered, amdgpu::Chipset chipset,
- PatternBenefit benefit)
- : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
+ unsigned shuffleBitwidth, bool matchClustered,
+ amdgpu::Chipset chipset, PatternBenefit benefit)
+ : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize), shuffleBitwidth(shuffleBitwidth),
matchClustered(matchClustered), chipset(chipset) {}
LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
@@ -505,12 +506,42 @@ struct ScalarSubgroupReduceToDPP final
"clusters of contiguous lanes.");
Type valueTy = op.getType();
- if (!valueTy.isIntOrFloat())
+ unsigned elemBitwidth =
+ getElementTypeOrSelf(valueTy).getIntOrFloatBitWidth();
+ if (!valueTy.isIntOrFloat() || elemBitwidth > shuffleBitwidth)
return rewriter.notifyMatchFailure(
op, "value type is not a compatible scalar");
+ Location loc = op.getLoc();
+ // Since this is already a native shuffle scalar, no packing is necessary.
+ if (elemBitwidth == shuffleBitwidth) {
+ auto identityFn = [](Value v) { return v; };
+ FailureOr<Value> dpp =
+ createSubgroupDPPReduction(rewriter, op, op.getValue(), op.getOp(),
+ *ci, chipset, identityFn, identityFn);
+ if (failed(dpp))
+ return failure();
+ rewriter.replaceOp(op, dpp.value());
+ return success();
+ }
+
+ auto shuffleIntType = rewriter.getIntegerType(shuffleBitwidth);
+ auto equivIntType = rewriter.getIntegerType(elemBitwidth);
+ auto packFn = [loc, &rewriter, equivIntType,
+ shuffleIntType](Value unpackedVal) -> Value {
+ auto asInt =
+ rewriter.create<arith::BitcastOp>(loc, equivIntType, unpackedVal);
+ return rewriter.create<arith::ExtUIOp>(loc, shuffleIntType, asInt);
+ };
+ auto unpackFn = [loc, &rewriter, equivIntType,
+ valueTy](Value packedVal) -> Value {
+ auto asInt =
+ rewriter.create<arith::TruncIOp>(loc, equivIntType, packedVal);
+ return rewriter.create<arith::BitcastOp>(loc, valueTy, asInt);
+ };
+
FailureOr<Value> dpp = createSubgroupDPPReduction(
- rewriter, op, op.getValue(), op.getOp(), *ci, chipset);
+ rewriter, op, op.getValue(), op.getOp(), *ci, chipset, packFn, unpackFn);
if (failed(dpp))
return failure();
@@ -520,6 +551,7 @@ struct ScalarSubgroupReduceToDPP final
private:
unsigned subgroupSize = 0;
+ unsigned shuffleBitwidth = 0;
bool matchClustered = false;
amdgpu::Chipset chipset;
};
@@ -534,19 +566,19 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns(
}
void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
- RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
- PatternBenefit benefit) {
- patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
- /*matchClustered=*/false, chipset,
- benefit);
+ RewritePatternSet &patterns, unsigned subgroupSize,
+ unsigned shuffleBitwidth, amdgpu::Chipset chipset, PatternBenefit benefit) {
+ patterns.add<ScalarSubgroupReduceToDPP>(
+ patterns.getContext(), subgroupSize, shuffleBitwidth,
+ /*matchClustered=*/false, chipset, benefit);
}
void mlir::populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
- RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
- PatternBenefit benefit) {
- patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
- /*matchClustered=*/true, chipset,
- benefit);
+ RewritePatternSet &patterns, unsigned subgroupSize,
+ unsigned shuffleBitwidth, amdgpu::Chipset chipset, PatternBenefit benefit) {
+ patterns.add<ScalarSubgroupReduceToDPP>(
+ patterns.getContext(), subgroupSize, shuffleBitwidth,
+ /*matchClustered=*/true, chipset, benefit);
}
void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
diff --git a/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
index 69094c518a159..e7489eaac4988 100644
--- a/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
@@ -1,14 +1,17 @@
add_mlir_dialect_library(MLIRGPUUtils
Utils.cpp
DistributionUtils.cpp
+ ReductionUtils.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/Utils
LINK_LIBS PUBLIC
- MLIRArithDialect
MLIRAffineDialect
+ MLIRArithDialect
+ MLIRAMDGPUDialect
MLIRGPUDialect
+ MLIRROCDLDialect
MLIRSupport
MLIRIR
)
diff --git a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
new file mode 100644
index 0000000000000..255c4152bd5a4
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
@@ -0,0 +1,170 @@
+//===- ReductionUtils.cpp - Reduction utilities for GPU ops -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements subgroup reduction utility methods for GPU ops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
+#include "mlir/Dialect/GPU/Utils/ReductionUtils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+
+#include <numeric>
+
+using namespace mlir;
+
+FailureOr<ClusterInfo> mlir::getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
+ unsigned subgroupSize) {
+ assert(llvm::isPowerOf2_32(subgroupSize));
+
+ std::optional<uint32_t> clusterSize = op.getClusterSize();
+ assert(!clusterSize ||
+ llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
+ if (clusterSize && *clusterSize > subgroupSize)
+ return op.emitOpError()
+ << "cluster size " << *clusterSize
+ << " is greater than subgroup size " << subgroupSize;
+ unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
+
+ auto clusterStride = op.getClusterStride();
+ assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
+ if (clusterStride >= subgroupSize)
+ return op.emitOpError()
+ << "cluster stride " << clusterStride
+ << " is not less than subgroup size " << subgroupSize;
+
+ return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
+}
+
+FailureOr<Value> mlir::createSubgroupDPPReduction(
+ PatternRewriter &rewriter, gpu::SubgroupReduceOp op, Value input,
+ gpu::AllReduceOperation mode, const ClusterInfo &ci,
+ amdgpu::Chipset chipset, function_ref<Value(Value)> packFn,
+ function_ref<Value(Value)> unpackFn) {
+ Location loc = op.getLoc();
+ Value dpp;
+ Value res = input;
+ constexpr int allRows = 0xf;
+ constexpr int allBanks = 0xf;
+ const bool boundCtrl = true;
+ if (ci.clusterSize >= 2) {
+ // Perform reduction between all lanes N <-> N+1.
+ res = packFn(res);
+ dpp = rewriter.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+ rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
+ dpp = unpackFn(dpp);
+ res = vector::makeArithReduction(rewriter, loc,
+ gpu::convertReductionKind(mode), res, dpp);
+ }
+
+ if (ci.clusterSize >= 4) {
+ // Perform reduction between all lanes N <-> N+2.
+ res = packFn(res);
+ dpp = rewriter.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+ rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
+ dpp = unpackFn(dpp);
+ res = vector::makeArithReduction(rewriter, loc,
+ gpu::convertReductionKind(mode), res, dpp);
+ }
+ if (ci.clusterSize >= 8) {
+ // Perform reduction between all lanes N <-> 7-N,
+ // e.g. lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
+ res = packFn(res);
+ dpp = rewriter.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
+ rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
+ dpp = unpackFn(dpp);
+ res = vector::makeArithReduction(rewriter, loc,
+ gpu::convertReductionKind(mode), res, dpp);
+ }
+ if (ci.clusterSize >= 16) {
+ // Perform reduction between all lanes N <-> 15-N,
+ // e.g. lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
+ res = packFn(res);
+ dpp = rewriter.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror,
+ rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
+ dpp = unpackFn(dpp);
+ res = vector::makeArithReduction(rewriter, loc,
+ gpu::convertReductionKind(mode), res, dpp);
+ }
+ if (ci.clusterSize >= 32) {
+ if (chipset.majorVersion <= 9) {
+ // Broadcast last value from each row to next row.
+ // Use row mask to avoid polluting rows 1 and 3.
+ res = packFn(res);
+ dpp = rewriter.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15,
+ rewriter.getUnitAttr(), 0xa, allBanks,
+ /*bound_ctrl*/ false);
+ dpp = unpackFn(dpp);
+ res = vector::makeArithReduction(
+ rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+ } else if (chipset.majorVersion <= 12) {
+ // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
+ Value uint32Max = rewriter.create<arith::ConstantOp>(
+ loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
+ res = packFn(res);
+ dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
+ uint32Max, uint32Max,
+ /*fi=*/true,
+ /*bound_ctrl=*/false);
+ dpp = unpackFn(dpp);
+ res = vector::makeArithReduction(
+ rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+ if (ci.subgroupSize == 32) {
+ Value lane0 = rewriter.create<arith::ConstantOp>(
+ loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+ res =
+ rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+ }
+ } else {
+ return rewriter.notifyMatchFailure(
+ op, "Subgroup reduce lowering to DPP not currently supported for "
+ "this device.");
+ }
+ }
+ if (ci.clusterSize >= 64) {
+ if (chipset.majorVersion <= 9) {
+ // Broadcast 31st lane value to rows 2 and 3.
+ // Use row mask to avoid polluting rows 0 and 1.
+ res = packFn(res);
+ dpp = rewriter.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_31,
+ rewriter.getUnitAttr(), 0xc, allBanks,
+ /*bound_ctrl*/ false);
+ dpp = unpackFn(dpp);
+
+ } else if (chipset.majorVersion <= 12) {
+ // Assume reduction across 32 lanes has been done.
+ // Perform final reduction manually by summing values in lane 0 and
+ // lane 32.
+ Value lane0 = rewriter.create<arith::ConstantOp>(
+ loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+ Value lane32 = rewriter.create<arith::ConstantOp>(
+ loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32));
+ dpp = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane32);
+ res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+ } else {
+ return rewriter.notifyMatchFailure(
+ op, "Subgroup reduce lowering to DPP not currently supported for "
+ "this device.");
+ }
+ res = vector::makeArithReduction(rewriter, loc,
+ gpu::convertReductionKind(mode), res, dpp);
+ }
+ assert(res.getType() == input.getType());
+ return res;
+}
\ No newline at end of file
diff --git a/mlir/lib/Dialect/GPU/Utils/Utils.cpp b/mlir/lib/Dialect/GPU/Utils/Utils.cpp
index 1f09875b3e273..53b1e0883055c 100644
--- a/mlir/lib/Dialect/GPU/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/GPU/Utils/Utils.cpp
@@ -41,4 +41,30 @@ vector::CombiningKind convertReductionKind(gpu::AllReduceOperation mode) {
llvm_unreachable("Vector and GPU reduction kinds should match 1:1");
}
+gpu::AllReduceOperation convertReductionMode(vector::CombiningKind kind) {
+ switch (kind) {
+#define MAP_CASE(X) \
+ case vector::CombiningKind::X: \
+ return gpu::AllReduceOperation::X
+
+ MAP_CASE(ADD);
+ MAP_CASE(MUL);
+ MAP_CASE(MINUI);
+ MAP_CASE(MINSI);
+ MAP_CASE(MINNUMF);
+ MAP_CASE(MAXSI);
+ MAP_CASE(MAXUI);
+ MAP_CASE(MAXNUMF);
+ MAP_CASE(AND);
+ MAP_CASE(OR);
+ MAP_CASE(XOR);
+ MAP_CASE(MINIMUMF);
+ MAP_CASE(MAXIMUMF);
+
+#undef MAP_CASE
+ }
+
+ llvm_unreachable("Vector and GPU reduction kinds should match 1:1");
+}
+
} // namespace mlir::gpu
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index fe402da4cc105..4ebcf897fd532 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -93,9 +93,9 @@ struct TestGpuSubgroupReduceLoweringPass
auto maybeChipset = amdgpu::Chipset::parse(target);
if (succeeded(maybeChipset)) {
populateGpuLowerSubgroupReduceToDPPPatterns(
- patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2));
+ patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset, PatternBenefit(2));
populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
- patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2));
+ patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset, PatternBenefit(2));
}
populateGpuLowerSubgroupReduceToShufflePatterns(
patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32);
>From 8315e1eb7e9aa57db6e3e9a1f58b5fa2c62fe0cf Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Tue, 22 Apr 2025 22:56:23 -0500
Subject: [PATCH 28/28] Formatting fix
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
index 255c4152bd5a4..2f50a1ec87cba 100644
--- a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
+++ b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
@@ -51,6 +51,7 @@ FailureOr<Value> mlir::createSubgroupDPPReduction(
gpu::AllReduceOperation mode, const ClusterInfo &ci,
amdgpu::Chipset chipset, function_ref<Value(Value)> packFn,
function_ref<Value(Value)> unpackFn) {
+
Location loc = op.getLoc();
Value dpp;
Value res = input;
@@ -167,4 +168,4 @@ FailureOr<Value> mlir::createSubgroupDPPReduction(
}
assert(res.getType() == input.getType());
return res;
-}
\ No newline at end of file
+}
More information about the Mlir-commits
mailing list