[Mlir-commits] [mlir] [AMDGPU] Implement gpu.subgroup_reduce with DPP intrinsics on AMD GPUs (PR #133204)

Fri Apr 11 09:28:06 PDT 2025

https://github.com/Muzammiluddin-Syed-ECE updated https://github.com/llvm/llvm-project/pull/133204

>From 4a66ccb794c68b0480a1766f70705ad509827bdb Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Tue, 25 Mar 2025 14:04:06 -0500
Subject: [PATCH 1/7] Creates AMDToGPUPass to house a subgroup reduce lowering
 pattern to DPP ops.

Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
 .../mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h |  32 ++++
 mlir/include/mlir/Conversion/Passes.h         |   1 +
 mlir/include/mlir/Conversion/Passes.td        |  16 ++
 mlir/lib/Conversion/CMakeLists.txt            |   1 +
 .../lib/Conversion/GPUToAMDGPU/CMakeLists.txt |  22 +++
 .../Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp    | 176 ++++++++++++++++++
 mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt |   1 +
 7 files changed, 249 insertions(+)
 create mode 100644 mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
 create mode 100644 mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
 create mode 100644 mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp

diff --git a/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
new file mode 100644
index 0000000000000..2d3bb384235ca
--- /dev/null
+++ b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
@@ -0,0 +1,32 @@
+//===- GPUToAMDGPU.h - Convert AMDGPU to ROCDL dialect --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
+#define MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
+
+
+#include "mlir/IR/PatternMatch.h"
+#include <memory>
+#include <string>
+
+namespace mlir {
+
+class LLVMTypeConverter;
+class RewritePatternSet;
+class TypeConverter;
+class Pass;
+
+#define GEN_PASS_DECL_CONVERTGPUTOAMDGPUPASS
+#include "mlir/Conversion/Passes.h.inc"
+
+void populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns,
+                                            unsigned subgroupSize,
+                                            PatternBenefit benefit);
+
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
\ No newline at end of file
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index ccd862f67c068..1189423799092 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -34,6 +34,7 @@
 #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
 #include "mlir/Conversion/FuncToSPIRV/FuncToSPIRVPass.h"
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
 #include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h"
 #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
 #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index bbba495e613b2..b28b4900e6814 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -643,6 +643,22 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> {
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// GPUToAMDGPU
+//===----------------------------------------------------------------------===//
+
+def ConvertGPUToAMDGPUPass : Pass<"convert-gpu-to-amdgpu"> {
+  let summary = "Generate AMDGPU operations for gpu operations";
+  let dependentDialects = [
+    "amdgpu::AMDGPUDialect",
+    "LLVM::LLVMDialect",
+    "ROCDL::ROCDLDialect",
+  ];
+  let options = [Option<"subgroupSize", "subgroup-size", "unsigned",
+                        /*default=*/"64",
+                        "Size of subgroup">];
+}
+
 //===----------------------------------------------------------------------===//
 // ConvertIndexToLLVMPass
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index b6c21440c571c..b957a4473f1e6 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -24,6 +24,7 @@ add_subdirectory(FuncToEmitC)
 add_subdirectory(FuncToLLVM)
 add_subdirectory(FuncToSPIRV)
 add_subdirectory(GPUCommon)
+add_subdirectory(GPUToAMDGPU)
 add_subdirectory(GPUToLLVMSPV)
 add_subdirectory(GPUToNVVM)
 add_subdirectory(GPUToROCDL)
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
new file mode 100644
index 0000000000000..9b82b5dc63d9c
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
@@ -0,0 +1,22 @@
+add_mlir_conversion_library(MLIRGPUToAMDGPU
+  GPUToAMDGPU.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/GPUToAMDGPU
+  
+  DEPENDS
+  MLIRConversionPassIncGen
+
+  LINK_COMPONENTS
+  Core
+
+  LINK_LIBS PUBLIC
+  MLIRLLVMCommonConversion
+  MLIRLLVMDialect
+  MLIRGPUDialect
+  MLIRAMDGPUDialect
+  MLIRAMDGPUUtils
+  MLIRROCDLDialect
+  MLIRPass
+  MLIRTransforms
+  )
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
new file mode 100644
index 0000000000000..bab83c12157a9
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
@@ -0,0 +1,176 @@
+//===- GPUToAMDGPU.cpp - GPU to AMDGPU dialect conversion -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
+
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Pass/Pass.h"
+
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+
+#include "mlir/Transforms/WalkPatternRewriteDriver.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace mlir {
+#define GEN_PASS_DEF_CONVERTGPUTOAMDGPUPASS
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+
+namespace {
+struct ClusterInfo {
+  unsigned clusterStride;
+  unsigned clusterSize;
+  unsigned subgroupSize;
+};
+
+static FailureOr<ClusterInfo>
+getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) {
+  assert(llvm::isPowerOf2_32(subgroupSize));
+
+  std::optional<uint32_t> clusterSize = op.getClusterSize();
+  assert(!clusterSize ||
+         llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
+  if (clusterSize && *clusterSize > subgroupSize)
+    return op.emitOpError()
+           << "cluster size " << *clusterSize
+           << " is greater than subgroup size " << subgroupSize;
+  unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
+
+  auto clusterStride = op.getClusterStride();
+  assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
+  if (clusterStride >= subgroupSize)
+    return op.emitOpError()
+           << "cluster stride " << clusterStride
+           << " is not less than subgroup size " << subgroupSize;
+
+  return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
+}
+
+Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
+                                 gpu::AllReduceOperation mode,
+                                 const ClusterInfo &ci) {
+  Value result = input;
+  if (ci.clusterSize >= 2) {
+    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
+    Value dppResult =
+        b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+                                amdgpu::DPPPerm::row_shr, permArg);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize >= 4) {
+    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
+    Value dppResult =
+        b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+                                amdgpu::DPPPerm::row_shr, permArg);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize >= 8) {
+    Value dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+        b.getUnitAttr());
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize >= 16) {
+    Value dppResult =
+        b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+                                amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize >= 32) {
+    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
+    Value dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+        b.getUnitAttr(), 10, 15, false);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize == 64) {
+    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
+    Value dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
+        b.getUnitAttr(), 12, 15, false);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  auto int32Type = IntegerType::get(b.getContext(), 32);
+  Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+  result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
+  assert(result.getType() == input.getType());
+  return result;
+}
+
+struct ScalarSubgroupReduceToShuffles final
+    : OpRewritePattern<gpu::SubgroupReduceOp> {
+  ScalarSubgroupReduceToShuffles(MLIRContext *ctx, unsigned subgroupSize,
+                                 bool matchClustered, PatternBenefit benefit)
+      : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
+        matchClustered(matchClustered) {}
+
+  LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
+                                PatternRewriter &rewriter) const override {
+    if (op.getClusterSize().has_value() != matchClustered) {
+      return rewriter.notifyMatchFailure(
+          op, llvm::formatv("op is {0}clustered but pattern is configured to "
+                            "only match {1}clustered ops",
+                            matchClustered ? "non-" : "",
+                            matchClustered ? "" : "non-"));
+    }
+
+    auto ci = getAndValidateClusterInfo(op, subgroupSize);
+    if (failed(ci))
+      return failure();
+
+    Location loc = op.getLoc();
+    rewriter.replaceOp(op, createSubgroupDPPReduction(
+                               rewriter, loc, op.getValue(), op.getOp(), *ci));
+    return success();
+  }
+
+private:
+  unsigned subgroupSize = 0;
+  bool matchClustered = false;
+};
+
+struct ConvertGPUToAMDGPUPass
+    : public impl::ConvertGPUToAMDGPUPassBase<ConvertGPUToAMDGPUPass> {
+  using Base::Base;
+
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    int subgroupSizeInt = static_cast<int>(subgroupSize);
+    populateAMDGPUOptimizedSubgroupReducePatterns(patterns, subgroupSizeInt,
+                                           PatternBenefit(1));
+    walkAndApplyPatterns(getOperation(), std::move(patterns));
+  }
+};
+} // namespace
+
+void mlir::populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns,
+                                                  unsigned subgroupSize,
+                                                  PatternBenefit benefit) {
+  patterns.add<ScalarSubgroupReduceToShuffles>(
+      patterns.getContext(), subgroupSize, /*matchClustered=*/true, benefit);
+}
diff --git a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
index 945e3ccdfa87b..52484ac69a3e2 100644
--- a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
@@ -15,6 +15,7 @@ add_mlir_conversion_library(MLIRGPUToROCDLTransforms
   MLIRMathToLLVM
   MLIRMathToROCDL
   MLIRAMDGPUToROCDL
+  MLIRGPUToAMDGPU
   MLIRFuncToLLVM
   MLIRGPUDialect
   MLIRGPUToGPURuntimeTransforms

>From ae9883c4b41c9b23e3601f3dea3286738420ac8d Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 2 Apr 2025 17:48:56 -0500
Subject: [PATCH 2/7] Fix for numerical issues in MatVec tests

Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
 mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
index bab83c12157a9..b07ed0a7c636a 100644
--- a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
+++ b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
@@ -82,26 +82,31 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
   }
 
   if (ci.clusterSize >= 8) {
-    Value dppResult = b.create<amdgpu::DPPOp>(
-        loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
-        b.getUnitAttr());
+    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 4);
+    Value dppResult =
+        b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+                                amdgpu::DPPPerm::row_shr, permArg);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
   }
 
   if (ci.clusterSize >= 16) {
+    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 8);
     Value dppResult =
         b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
-                                amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
+                                amdgpu::DPPPerm::row_shr, permArg);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
   }
 
+  const int allRows = 0xf;
+  const int allBanks = 0xf;
+
   if (ci.clusterSize >= 32) {
     auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
     Value dppResult = b.create<amdgpu::DPPOp>(
         loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
-        b.getUnitAttr(), 10, 15, false);
+        b.getUnitAttr(), 0xa, allBanks, false);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
   }
@@ -110,7 +115,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
     auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
     Value dppResult = b.create<amdgpu::DPPOp>(
         loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
-        b.getUnitAttr(), 12, 15, false);
+        b.getUnitAttr(), allRows, allBanks, false);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
   }

>From df55f2001906f4e76892ae15dcdaace23821905e Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Thu, 3 Apr 2025 15:08:59 -0500
Subject: [PATCH 3/7] Rewrites pattern to be closer to device lib impl.

Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
 .../mlir/Dialect/GPU/Transforms/Passes.h      |   7 ++
 .../Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp    |  27 ++---
 .../GPU/Transforms/SubgroupReduceLowering.cpp | 109 ++++++++++++++++++
 3 files changed, 130 insertions(+), 13 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index 5cc65082a7e56..41e0759e958b5 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -62,6 +62,13 @@ void populateGpuLowerSubgroupReduceToShufflePatterns(
     RewritePatternSet &patterns, unsigned subgroupSize,
     unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);
 
+/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp`
+/// ops over scalar types. Assumes that the subgroup has
+/// `subgroupSize` lanes. Applicable only to AMD GPUs.
+void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns,
+                                                 unsigned subgroupSize,
+                                                 PatternBenefit benefit = 1);
+
 /// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns`
 /// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
 void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
index b07ed0a7c636a..590fa7d9b4ffc 100644
--- a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
+++ b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
@@ -67,7 +67,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
     auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
     Value dppResult =
         b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
-                                amdgpu::DPPPerm::row_shr, permArg);
+                                amdgpu::DPPPerm::row_shl, permArg);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
   }
@@ -76,39 +76,41 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
     auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
     Value dppResult =
         b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
-                                amdgpu::DPPPerm::row_shr, permArg);
+                                amdgpu::DPPPerm::row_shl, permArg);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
   }
 
   if (ci.clusterSize >= 8) {
-    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 4);
-    Value dppResult =
-        b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
-                                amdgpu::DPPPerm::row_shr, permArg);
+    Value dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+        b.getUnitAttr());
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
   }
 
   if (ci.clusterSize >= 16) {
-    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 8);
     Value dppResult =
         b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
-                                amdgpu::DPPPerm::row_shr, permArg);
+                                amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
   }
 
   const int allRows = 0xf;
   const int allBanks = 0xf;
-
+  auto int32Type = IntegerType::get(b.getContext(), 32);
   if (ci.clusterSize >= 32) {
     auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
     Value dppResult = b.create<amdgpu::DPPOp>(
         loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
         b.getUnitAttr(), 0xa, allBanks, false);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
-                                        result, dppResult);
+    result, dppResult);
+    if (ci.subgroupSize == 32) {
+      Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
+      result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);    
+    }
   }
 
   if (ci.clusterSize == 64) {
@@ -118,11 +120,10 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
         b.getUnitAttr(), allRows, allBanks, false);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
+    Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+    result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
   }
 
-  auto int32Type = IntegerType::get(b.getContext(), 32);
-  Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
-  result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
   assert(result.getType() == input.getType());
   return result;
 }
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 43eff3eddcc49..f07ef6cf154a9 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -12,6 +12,8 @@
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/GPU/Utils/GPUUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -362,6 +364,106 @@ struct VectorSubgroupReduceToShuffles final
   unsigned shuffleBitwidth = 0;
   bool matchClustered = false;
 };
+
+Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
+                                 gpu::AllReduceOperation mode,
+                                 const ClusterInfo &ci) {
+  Value result = input;
+  if (ci.clusterSize >= 2) {
+    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
+    Value dppResult =
+        b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+                                amdgpu::DPPPerm::row_shl, permArg);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize >= 4) {
+    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
+    Value dppResult =
+        b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+                                amdgpu::DPPPerm::row_shl, permArg);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize >= 8) {
+    Value dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+        b.getUnitAttr());
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize >= 16) {
+    Value dppResult =
+        b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+                                amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  const int allRows = 0xf;
+  const int allBanks = 0xf;
+  auto int32Type = IntegerType::get(b.getContext(), 32);
+  if (ci.clusterSize >= 32) {
+    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
+    Value dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+        b.getUnitAttr(), 0xa, allBanks, false);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+    if (ci.subgroupSize == 32) {
+      Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
+      result =
+          b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
+    }
+  }
+
+  if (ci.clusterSize == 64) {
+    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
+    Value dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
+        b.getUnitAttr(), allRows, allBanks, false);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+    Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+    result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
+  }
+
+  assert(result.getType() == input.getType());
+  return result;
+}
+
+struct ScalarSubgroupReduceToDPP final
+    : OpRewritePattern<gpu::SubgroupReduceOp> {
+  ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
+                            bool matchClustered, PatternBenefit benefit)
+      : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
+        matchClustered(matchClustered) {}
+
+  LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
+                                PatternRewriter &rewriter) const override {
+    if (op.getClusterSize().has_value() != matchClustered) {
+      return rewriter.notifyMatchFailure(
+          op, llvm::formatv("op is {0}clustered but pattern is configured to "
+                            "only match {1}clustered ops",
+                            matchClustered ? "non-" : "",
+                            matchClustered ? "" : "non-"));
+    }
+    auto ci = getAndValidateClusterInfo(op, subgroupSize);
+    if (failed(ci))
+      return failure();
+    Location loc = op.getLoc();
+    rewriter.replaceOp(op, createSubgroupDPPReduction(
+                               rewriter, loc, op.getValue(), op.getOp(), *ci));
+    return success();
+  }
+
+private:
+  unsigned subgroupSize = 0;
+  bool matchClustered = false;
+};
 } // namespace
 
 void mlir::populateGpuBreakDownSubgroupReducePatterns(
@@ -372,6 +474,13 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns(
   patterns.add<ScalarizeSingleElementReduce>(patterns.getContext(), benefit);
 }
 
+void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
+    RewritePatternSet &patterns, unsigned subgroupSize,
+    PatternBenefit benefit) {
+  patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
+                                          /*matchClustered=*/true, benefit);
+}
+
 void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
     RewritePatternSet &patterns, unsigned subgroupSize,
     unsigned shuffleBitwidth, PatternBenefit benefit) {

>From 5cb56322b21e00e400f82c3972e1ec32c08c99bb Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Thu, 3 Apr 2025 15:15:36 -0500
Subject: [PATCH 4/7] Removes AMDToGPUPass, moving pattern into existing pass

Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
 .../mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h |  32 ---
 mlir/include/mlir/Conversion/Passes.h         |   1 -
 mlir/include/mlir/Conversion/Passes.td        |  16 --
 mlir/lib/Conversion/CMakeLists.txt            |   1 -
 .../lib/Conversion/GPUToAMDGPU/CMakeLists.txt |  22 ---
 .../Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp    | 182 ------------------
 mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt |   1 -
 7 files changed, 255 deletions(-)
 delete mode 100644 mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
 delete mode 100644 mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
 delete mode 100644 mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp

diff --git a/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
deleted file mode 100644
index 2d3bb384235ca..0000000000000
--- a/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//===- GPUToAMDGPU.h - Convert AMDGPU to ROCDL dialect --*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
-#define MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
-
-
-#include "mlir/IR/PatternMatch.h"
-#include <memory>
-#include <string>
-
-namespace mlir {
-
-class LLVMTypeConverter;
-class RewritePatternSet;
-class TypeConverter;
-class Pass;
-
-#define GEN_PASS_DECL_CONVERTGPUTOAMDGPUPASS
-#include "mlir/Conversion/Passes.h.inc"
-
-void populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns,
-                                            unsigned subgroupSize,
-                                            PatternBenefit benefit);
-
-} // namespace mlir
-
-#endif // MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
\ No newline at end of file
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index 1189423799092..ccd862f67c068 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -34,7 +34,6 @@
 #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
 #include "mlir/Conversion/FuncToSPIRV/FuncToSPIRVPass.h"
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
-#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
 #include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h"
 #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
 #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index b28b4900e6814..bbba495e613b2 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -643,22 +643,6 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> {
   ];
 }
 
-//===----------------------------------------------------------------------===//
-// GPUToAMDGPU
-//===----------------------------------------------------------------------===//
-
-def ConvertGPUToAMDGPUPass : Pass<"convert-gpu-to-amdgpu"> {
-  let summary = "Generate AMDGPU operations for gpu operations";
-  let dependentDialects = [
-    "amdgpu::AMDGPUDialect",
-    "LLVM::LLVMDialect",
-    "ROCDL::ROCDLDialect",
-  ];
-  let options = [Option<"subgroupSize", "subgroup-size", "unsigned",
-                        /*default=*/"64",
-                        "Size of subgroup">];
-}
-
 //===----------------------------------------------------------------------===//
 // ConvertIndexToLLVMPass
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index b957a4473f1e6..b6c21440c571c 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -24,7 +24,6 @@ add_subdirectory(FuncToEmitC)
 add_subdirectory(FuncToLLVM)
 add_subdirectory(FuncToSPIRV)
 add_subdirectory(GPUCommon)
-add_subdirectory(GPUToAMDGPU)
 add_subdirectory(GPUToLLVMSPV)
 add_subdirectory(GPUToNVVM)
 add_subdirectory(GPUToROCDL)
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
deleted file mode 100644
index 9b82b5dc63d9c..0000000000000
--- a/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-add_mlir_conversion_library(MLIRGPUToAMDGPU
-  GPUToAMDGPU.cpp
-
-  ADDITIONAL_HEADER_DIRS
-  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/GPUToAMDGPU
-  
-  DEPENDS
-  MLIRConversionPassIncGen
-
-  LINK_COMPONENTS
-  Core
-
-  LINK_LIBS PUBLIC
-  MLIRLLVMCommonConversion
-  MLIRLLVMDialect
-  MLIRGPUDialect
-  MLIRAMDGPUDialect
-  MLIRAMDGPUUtils
-  MLIRROCDLDialect
-  MLIRPass
-  MLIRTransforms
-  )
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
deleted file mode 100644
index 590fa7d9b4ffc..0000000000000
--- a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-//===- GPUToAMDGPU.cpp - GPU to AMDGPU dialect conversion -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
-
-#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/TypeUtilities.h"
-#include "mlir/Pass/Pass.h"
-
-#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-
-#include "mlir/Transforms/WalkPatternRewriteDriver.h"
-#include "llvm/Support/FormatVariadic.h"
-
-namespace mlir {
-#define GEN_PASS_DEF_CONVERTGPUTOAMDGPUPASS
-#include "mlir/Conversion/Passes.h.inc"
-} // namespace mlir
-
-using namespace mlir;
-
-namespace {
-struct ClusterInfo {
-  unsigned clusterStride;
-  unsigned clusterSize;
-  unsigned subgroupSize;
-};
-
-static FailureOr<ClusterInfo>
-getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) {
-  assert(llvm::isPowerOf2_32(subgroupSize));
-
-  std::optional<uint32_t> clusterSize = op.getClusterSize();
-  assert(!clusterSize ||
-         llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
-  if (clusterSize && *clusterSize > subgroupSize)
-    return op.emitOpError()
-           << "cluster size " << *clusterSize
-           << " is greater than subgroup size " << subgroupSize;
-  unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
-
-  auto clusterStride = op.getClusterStride();
-  assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
-  if (clusterStride >= subgroupSize)
-    return op.emitOpError()
-           << "cluster stride " << clusterStride
-           << " is not less than subgroup size " << subgroupSize;
-
-  return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
-}
-
-Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
-                                 gpu::AllReduceOperation mode,
-                                 const ClusterInfo &ci) {
-  Value result = input;
-  if (ci.clusterSize >= 2) {
-    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
-    Value dppResult =
-        b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
-                                amdgpu::DPPPerm::row_shl, permArg);
-    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
-                                        result, dppResult);
-  }
-
-  if (ci.clusterSize >= 4) {
-    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
-    Value dppResult =
-        b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
-                                amdgpu::DPPPerm::row_shl, permArg);
-    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
-                                        result, dppResult);
-  }
-
-  if (ci.clusterSize >= 8) {
-    Value dppResult = b.create<amdgpu::DPPOp>(
-        loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
-        b.getUnitAttr());
-    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
-                                        result, dppResult);
-  }
-
-  if (ci.clusterSize >= 16) {
-    Value dppResult =
-        b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
-                                amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
-    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
-                                        result, dppResult);
-  }
-
-  const int allRows = 0xf;
-  const int allBanks = 0xf;
-  auto int32Type = IntegerType::get(b.getContext(), 32);
-  if (ci.clusterSize >= 32) {
-    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
-    Value dppResult = b.create<amdgpu::DPPOp>(
-        loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
-        b.getUnitAttr(), 0xa, allBanks, false);
-    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
-    result, dppResult);
-    if (ci.subgroupSize == 32) {
-      Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
-      result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);    
-    }
-  }
-
-  if (ci.clusterSize == 64) {
-    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
-    Value dppResult = b.create<amdgpu::DPPOp>(
-        loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
-        b.getUnitAttr(), allRows, allBanks, false);
-    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
-                                        result, dppResult);
-    Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
-    result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
-  }
-
-  assert(result.getType() == input.getType());
-  return result;
-}
-
-struct ScalarSubgroupReduceToShuffles final
-    : OpRewritePattern<gpu::SubgroupReduceOp> {
-  ScalarSubgroupReduceToShuffles(MLIRContext *ctx, unsigned subgroupSize,
-                                 bool matchClustered, PatternBenefit benefit)
-      : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
-        matchClustered(matchClustered) {}
-
-  LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
-                                PatternRewriter &rewriter) const override {
-    if (op.getClusterSize().has_value() != matchClustered) {
-      return rewriter.notifyMatchFailure(
-          op, llvm::formatv("op is {0}clustered but pattern is configured to "
-                            "only match {1}clustered ops",
-                            matchClustered ? "non-" : "",
-                            matchClustered ? "" : "non-"));
-    }
-
-    auto ci = getAndValidateClusterInfo(op, subgroupSize);
-    if (failed(ci))
-      return failure();
-
-    Location loc = op.getLoc();
-    rewriter.replaceOp(op, createSubgroupDPPReduction(
-                               rewriter, loc, op.getValue(), op.getOp(), *ci));
-    return success();
-  }
-
-private:
-  unsigned subgroupSize = 0;
-  bool matchClustered = false;
-};
-
-struct ConvertGPUToAMDGPUPass
-    : public impl::ConvertGPUToAMDGPUPassBase<ConvertGPUToAMDGPUPass> {
-  using Base::Base;
-
-  void runOnOperation() override {
-    RewritePatternSet patterns(&getContext());
-    int subgroupSizeInt = static_cast<int>(subgroupSize);
-    populateAMDGPUOptimizedSubgroupReducePatterns(patterns, subgroupSizeInt,
-                                           PatternBenefit(1));
-    walkAndApplyPatterns(getOperation(), std::move(patterns));
-  }
-};
-} // namespace
-
-void mlir::populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns,
-                                                  unsigned subgroupSize,
-                                                  PatternBenefit benefit) {
-  patterns.add<ScalarSubgroupReduceToShuffles>(
-      patterns.getContext(), subgroupSize, /*matchClustered=*/true, benefit);
-}
diff --git a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
index 52484ac69a3e2..945e3ccdfa87b 100644
--- a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
@@ -15,7 +15,6 @@ add_mlir_conversion_library(MLIRGPUToROCDLTransforms
   MLIRMathToLLVM
   MLIRMathToROCDL
   MLIRAMDGPUToROCDL
-  MLIRGPUToAMDGPU
   MLIRFuncToLLVM
   MLIRGPUDialect
   MLIRGPUToGPURuntimeTransforms

>From 25eed299795f9970cec5d891396002811f1e6493 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Thu, 10 Apr 2025 14:06:51 -0500
Subject: [PATCH 5/7] Adding permlanex16 and other dpp related ops to mlir
 dialect

Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td |  4 +++-
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td  | 16 ++++++++++++++
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           |  6 ++++++
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  | 14 +++++++++++++
 .../GPU/Transforms/SubgroupReduceLowering.cpp | 21 +++++++++++--------
 mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir   |  8 +++++++
 6 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 9cdd961d96ff5..d2e69da1ba09f 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -524,7 +524,8 @@ def AMDGPU_DPPPerm : I32EnumAttr<"DPPPerm",
       I32EnumAttrCase<"row_mirror", 8>,
       I32EnumAttrCase<"row_half_mirror", 9>,
       I32EnumAttrCase<"row_bcast_15", 10>,
-      I32EnumAttrCase<"row_bcast_31", 11>
+      I32EnumAttrCase<"row_bcast_31", 11>,
+      I32EnumAttrCase<"row_share", 12>
     ]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::amdgpu";
@@ -557,6 +558,7 @@ def AMDGPU_DPPOp : AMDGPU_Op<"dpp", [SameTypeOperands, AllTypesMatch<["result",
     - Reverse within a half-row (`row_half_mirror`)
     - Broadcast the 15th lane of each row to the next row (`row_bcast`)
     - Broadcast lane 31 to rows 2 and 3 (`row_bcast`)
+    - Broadcast a lane [0-15] within row 0 to all lanes of row 0 (`row_share`)
   }];
   let results = (outs AnyType:$result);
   let assemblyFormat = [{
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 8945466f5ef5b..907df07d7e954 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -668,6 +668,22 @@ def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0],
   }];
 }
 
+// PermLaneX16 intrinsic operation
+def ROCDL_PermlaneX16Op : ROCDL_IntrOp<"permlanex16", [], [0],
+    [AllTypesMatch<["res", "old", "src0", "src1", "src2"]>], 1, 0, 0,
+    [4, 5], ["fi", "boundControl"]>,
+  Arguments<(ins LLVM_Type:$old, LLVM_Type:$src0, LLVM_Type:$src1, LLVM_Type:$src2,
+             I1Attr:$fi, I1Attr:$boundControl)> {
+  let results = (outs LLVM_Type:$res);
+  let assemblyFormat = [{
+    attr-dict $old `,` $src0 `,` $src1 `,` $src2 `,` $fi `,` $boundControl `:` type($src0)
+  }];
+  let description = [{
+    Performs a `permlanex16` operation with the given operands, applying the
+    permutation specified by $fi to the provided inputs.
+  }];
+}
+
 def ROCDL_V2I16Type : FixedVectorOfLengthAndType<[2], [I16]>,
                         BuildableType<"::mlir::VectorType::get("
                           "{2},$_builder.getI16Type())">;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 56d40d6d123bf..65581a73fb267 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1244,6 +1244,7 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
       ROW_HALF_MIRROR = 0x141,
       BCAST15 = 0x142,
       BCAST31 = 0x143,
+      ROW_SHARE0 = 0x150
     };
 
     auto kind = DppOp.getKind();
@@ -1301,6 +1302,11 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
     case DPPPerm::row_bcast_31:
       DppCtrl = DppCtrl::BCAST31;
       break;
+    case DPPPerm::row_share:
+      if (auto intAttr = cast<IntegerAttr>(*permArgument)) {
+        DppCtrl = intAttr.getInt() + DppCtrl::ROW_SHARE0;
+      }
+      break;
     }
 
     // Check for row_mask, bank_mask, bound_ctrl if they exist and create
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 1e482515a4ee0..45de5c8b910ec 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -457,6 +457,20 @@ LogicalResult DPPOp::verify() {
     }
     break;
   }
+
+  case DPPPerm::row_share: {
+    if (!permArgument) {
+      return emitOpError("Attribute '" + Twine(stringifyDPPPerm(kind)) +
+                         "' value not specified");
+    }
+    if (auto intAttr = dyn_cast<IntegerAttr>(permArgument)) {
+      uint32_t attrValue = intAttr.getInt();
+      if (attrValue < 0 || attrValue > 15) {
+        return emitOpError(
+            "Attribute value for 'row_share' must be between 0 and 15");
+      }
+    }
+  } break;
   }
   return success();
 }
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index f07ef6cf154a9..3e64681ad2dd2 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -370,7 +370,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
                                  const ClusterInfo &ci) {
   Value result = input;
   if (ci.clusterSize >= 2) {
-    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
+    auto permArg = b.getI32IntegerAttr(1);
     Value dppResult =
         b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
                                 amdgpu::DPPPerm::row_shl, permArg);
@@ -379,7 +379,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
   }
 
   if (ci.clusterSize >= 4) {
-    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
+    auto permArg = b.getI32IntegerAttr(2);
     Value dppResult =
         b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
                                 amdgpu::DPPPerm::row_shl, permArg);
@@ -405,16 +405,19 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
 
   const int allRows = 0xf;
   const int allBanks = 0xf;
-  auto int32Type = IntegerType::get(b.getContext(), 32);
+  auto uint32Type = b.getIntegerType(32, false);
   if (ci.clusterSize >= 32) {
-    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
-    Value dppResult = b.create<amdgpu::DPPOp>(
-        loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
-        b.getUnitAttr(), 0xa, allBanks, false);
+    // auto permArg = b.getI32IntegerAttr(15);
+    // Value dppResult = b.create<amdgpu::DPPOp>(
+    //     loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+    //     b.getUnitAttr(), 0xa, allBanks, false);
+    auto uIntMax = llvm::APInt::getMaxValue(32u);
+    Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, uint32Type, uIntMax);
+    Value dppResult = b.create<ROCDL::PermlaneX16Op>(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
     if (ci.subgroupSize == 32) {
-      Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
+      Value lane01 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 1);
       result =
           b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
     }
@@ -427,7 +430,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
         b.getUnitAttr(), allRows, allBanks, false);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
-    Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+    Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
     result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
   }
 
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
index 14691e73e62d7..64b3328b70ab4 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
@@ -137,3 +137,11 @@ func.func @row_bcast_update_dpp_f16(%arg0: f16, %arg1: f16) -> f16 {
   %0 = amdgpu.dpp %arg0 %arg1 row_bcast_15 { bound_ctrl = true } : f16
     return %0 : f16
 }
+
+func.func @dpp_row_share(%arg0: i32, %arg1: i32) -> i32 {  
+  // CHECK-LABEL: func @dpp_row_share  
+  // CHECK: rocdl.update.dpp %arg0, %arg1 with 351, 15, 15, false : i32  
+  // CHECK: return %0 : i32  
+  %0 = amdgpu.dpp %arg0 %arg1 row_share ( 0xf : i32 ) : i32  
+  return %0 : i32  
+}

>From 13e55b4ecda61643186392002b3d9d260a43d97c Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Fri, 11 Apr 2025 10:30:10 -0500
Subject: [PATCH 6/7] Fixing permlanex16 intrinsic failure

Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td               | 4 ++--
 mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 7 +------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 907df07d7e954..f7eeb4f47bf2a 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -670,13 +670,13 @@ def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0],
 
 // PermLaneX16 intrinsic operation
 def ROCDL_PermlaneX16Op : ROCDL_IntrOp<"permlanex16", [], [0],
-    [AllTypesMatch<["res", "old", "src0", "src1", "src2"]>], 1, 0, 0,
+    [AllTypesMatch<["res", "old", "src0"]>, AllTypesMatch<["src1", "src2"]>], 1, 0, 0,
     [4, 5], ["fi", "boundControl"]>,
   Arguments<(ins LLVM_Type:$old, LLVM_Type:$src0, LLVM_Type:$src1, LLVM_Type:$src2,
              I1Attr:$fi, I1Attr:$boundControl)> {
   let results = (outs LLVM_Type:$res);
   let assemblyFormat = [{
-    attr-dict $old `,` $src0 `,` $src1 `,` $src2 `,` $fi `,` $boundControl `:` type($src0)
+    attr-dict $old `,` $src0 `,` $src1 `,` $src2 `,` $fi `,` $boundControl `:` type($src0) `,` type($src1)
   }];
   let description = [{
     Performs a `permlanex16` operation with the given operands, applying the
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 3e64681ad2dd2..b6bd67fa0ce53 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -405,14 +405,9 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
 
   const int allRows = 0xf;
   const int allBanks = 0xf;
-  auto uint32Type = b.getIntegerType(32, false);
   if (ci.clusterSize >= 32) {
-    // auto permArg = b.getI32IntegerAttr(15);
-    // Value dppResult = b.create<amdgpu::DPPOp>(
-    //     loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
-    //     b.getUnitAttr(), 0xa, allBanks, false);
     auto uIntMax = llvm::APInt::getMaxValue(32u);
-    Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, uint32Type, uIntMax);
+    Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), uIntMax);
     Value dppResult = b.create<ROCDL::PermlaneX16Op>(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);

>From fb55f7d44f7421082d8a41f3906e51f08215423a Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Fri, 11 Apr 2025 11:27:53 -0500
Subject: [PATCH 7/7] simplify verbose typing

Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
 mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index b6bd67fa0ce53..b9eae59584e94 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -419,7 +419,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
   }
 
   if (ci.clusterSize == 64) {
-    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
+    auto permArg = b.getI32IntegerAttr(31);
     Value dppResult = b.create<amdgpu::DPPOp>(
         loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
         b.getUnitAttr(), allRows, allBanks, false);