[Mlir-commits] [mlir] [mlir][AMDGPU] Implement gpu.subgroup_reduce with DPP intrinsics on AMD GPUs (PR #133204)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Apr 16 14:50:24 PDT 2025
https://github.com/Muzammiluddin-Syed-ECE updated https://github.com/llvm/llvm-project/pull/133204
>From 8608b39cd6eb2f11b6c47e1c585eccdc1e9a3a55 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Tue, 25 Mar 2025 14:04:06 -0500
Subject: [PATCH 01/23] Creates GPUToAMDGPUPass to house a subgroup reduce
lowering pattern to DPP ops.
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h | 32 ++++
mlir/include/mlir/Conversion/Passes.h | 1 +
mlir/include/mlir/Conversion/Passes.td | 16 ++
mlir/lib/Conversion/CMakeLists.txt | 1 +
.../lib/Conversion/GPUToAMDGPU/CMakeLists.txt | 22 +++
.../Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp | 176 ++++++++++++++++++
mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt | 1 +
7 files changed, 249 insertions(+)
create mode 100644 mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
create mode 100644 mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
create mode 100644 mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
diff --git a/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
new file mode 100644
index 0000000000000..2d3bb384235ca
--- /dev/null
+++ b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
@@ -0,0 +1,32 @@
+//===- GPUToAMDGPU.h - Convert AMDGPU to ROCDL dialect --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
+#define MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
+
+
+#include "mlir/IR/PatternMatch.h"
+#include <memory>
+#include <string>
+
+namespace mlir {
+
+class LLVMTypeConverter;
+class RewritePatternSet;
+class TypeConverter;
+class Pass;
+
+#define GEN_PASS_DECL_CONVERTGPUTOAMDGPUPASS
+#include "mlir/Conversion/Passes.h.inc"
+
+void populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns,
+ unsigned subgroupSize,
+ PatternBenefit benefit);
+
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
\ No newline at end of file
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index ccd862f67c068..1189423799092 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -34,6 +34,7 @@
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
#include "mlir/Conversion/FuncToSPIRV/FuncToSPIRVPass.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
#include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index bbba495e613b2..b28b4900e6814 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -643,6 +643,22 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> {
];
}
+//===----------------------------------------------------------------------===//
+// GPUToAMDGPU
+//===----------------------------------------------------------------------===//
+
+def ConvertGPUToAMDGPUPass : Pass<"convert-gpu-to-amdgpu"> {
+ let summary = "Generate AMDGPU operations for gpu operations";
+ let dependentDialects = [
+ "amdgpu::AMDGPUDialect",
+ "LLVM::LLVMDialect",
+ "ROCDL::ROCDLDialect",
+ ];
+ let options = [Option<"subgroupSize", "subgroup-size", "unsigned",
+ /*default=*/"64",
+ "Size of subgroup">];
+}
+
//===----------------------------------------------------------------------===//
// ConvertIndexToLLVMPass
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index b6c21440c571c..b957a4473f1e6 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -24,6 +24,7 @@ add_subdirectory(FuncToEmitC)
add_subdirectory(FuncToLLVM)
add_subdirectory(FuncToSPIRV)
add_subdirectory(GPUCommon)
+add_subdirectory(GPUToAMDGPU)
add_subdirectory(GPUToLLVMSPV)
add_subdirectory(GPUToNVVM)
add_subdirectory(GPUToROCDL)
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
new file mode 100644
index 0000000000000..9b82b5dc63d9c
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
@@ -0,0 +1,22 @@
+add_mlir_conversion_library(MLIRGPUToAMDGPU
+ GPUToAMDGPU.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/GPUToAMDGPU
+
+ DEPENDS
+ MLIRConversionPassIncGen
+
+ LINK_COMPONENTS
+ Core
+
+ LINK_LIBS PUBLIC
+ MLIRLLVMCommonConversion
+ MLIRLLVMDialect
+ MLIRGPUDialect
+ MLIRAMDGPUDialect
+ MLIRAMDGPUUtils
+ MLIRROCDLDialect
+ MLIRPass
+ MLIRTransforms
+ )
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
new file mode 100644
index 0000000000000..bab83c12157a9
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
@@ -0,0 +1,176 @@
+//===- GPUToAMDGPU.cpp - GPU to AMDGPU dialect conversion -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
+
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Pass/Pass.h"
+
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+
+#include "mlir/Transforms/WalkPatternRewriteDriver.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace mlir {
+#define GEN_PASS_DEF_CONVERTGPUTOAMDGPUPASS
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+
+namespace {
+struct ClusterInfo {
+ unsigned clusterStride;
+ unsigned clusterSize;
+ unsigned subgroupSize;
+};
+
+static FailureOr<ClusterInfo>
+getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) {
+ assert(llvm::isPowerOf2_32(subgroupSize));
+
+ std::optional<uint32_t> clusterSize = op.getClusterSize();
+ assert(!clusterSize ||
+ llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
+ if (clusterSize && *clusterSize > subgroupSize)
+ return op.emitOpError()
+ << "cluster size " << *clusterSize
+ << " is greater than subgroup size " << subgroupSize;
+ unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
+
+ auto clusterStride = op.getClusterStride();
+ assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
+ if (clusterStride >= subgroupSize)
+ return op.emitOpError()
+ << "cluster stride " << clusterStride
+ << " is not less than subgroup size " << subgroupSize;
+
+ return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
+}
+
+Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
+ gpu::AllReduceOperation mode,
+ const ClusterInfo &ci) {
+ Value result = input;
+ if (ci.clusterSize >= 2) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shr, permArg);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 4) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shr, permArg);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 8) {
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+ b.getUnitAttr());
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 16) {
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 32) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+ b.getUnitAttr(), 10, 15, false);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize == 64) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
+ b.getUnitAttr(), 12, 15, false);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ auto int32Type = IntegerType::get(b.getContext(), 32);
+ Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+ result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
+ assert(result.getType() == input.getType());
+ return result;
+}
+
+struct ScalarSubgroupReduceToShuffles final
+ : OpRewritePattern<gpu::SubgroupReduceOp> {
+ ScalarSubgroupReduceToShuffles(MLIRContext *ctx, unsigned subgroupSize,
+ bool matchClustered, PatternBenefit benefit)
+ : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
+ matchClustered(matchClustered) {}
+
+ LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
+ PatternRewriter &rewriter) const override {
+ if (op.getClusterSize().has_value() != matchClustered) {
+ return rewriter.notifyMatchFailure(
+ op, llvm::formatv("op is {0}clustered but pattern is configured to "
+ "only match {1}clustered ops",
+ matchClustered ? "non-" : "",
+ matchClustered ? "" : "non-"));
+ }
+
+ auto ci = getAndValidateClusterInfo(op, subgroupSize);
+ if (failed(ci))
+ return failure();
+
+ Location loc = op.getLoc();
+ rewriter.replaceOp(op, createSubgroupDPPReduction(
+ rewriter, loc, op.getValue(), op.getOp(), *ci));
+ return success();
+ }
+
+private:
+ unsigned subgroupSize = 0;
+ bool matchClustered = false;
+};
+
+struct ConvertGPUToAMDGPUPass
+ : public impl::ConvertGPUToAMDGPUPassBase<ConvertGPUToAMDGPUPass> {
+ using Base::Base;
+
+ void runOnOperation() override {
+ RewritePatternSet patterns(&getContext());
+ int subgroupSizeInt = static_cast<int>(subgroupSize);
+ populateAMDGPUOptimizedSubgroupReducePatterns(patterns, subgroupSizeInt,
+ PatternBenefit(1));
+ walkAndApplyPatterns(getOperation(), std::move(patterns));
+ }
+};
+} // namespace
+
+void mlir::populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns,
+ unsigned subgroupSize,
+ PatternBenefit benefit) {
+ patterns.add<ScalarSubgroupReduceToShuffles>(
+ patterns.getContext(), subgroupSize, /*matchClustered=*/true, benefit);
+}
diff --git a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
index 945e3ccdfa87b..52484ac69a3e2 100644
--- a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
@@ -15,6 +15,7 @@ add_mlir_conversion_library(MLIRGPUToROCDLTransforms
MLIRMathToLLVM
MLIRMathToROCDL
MLIRAMDGPUToROCDL
+ MLIRGPUToAMDGPU
MLIRFuncToLLVM
MLIRGPUDialect
MLIRGPUToGPURuntimeTransforms
>From 69944c5e09572883a43c76e9c1c2a9d46bedb08a Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 2 Apr 2025 17:48:56 -0500
Subject: [PATCH 02/23] Fix for numerical issues in MatVec tests
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
index bab83c12157a9..b07ed0a7c636a 100644
--- a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
+++ b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
@@ -82,26 +82,31 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
}
if (ci.clusterSize >= 8) {
- Value dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
- b.getUnitAttr());
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 4);
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shr, permArg);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
if (ci.clusterSize >= 16) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 8);
Value dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
+ amdgpu::DPPPerm::row_shr, permArg);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
+ const int allRows = 0xf;
+ const int allBanks = 0xf;
+
if (ci.clusterSize >= 32) {
auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
Value dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
- b.getUnitAttr(), 10, 15, false);
+ b.getUnitAttr(), 0xa, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
@@ -110,7 +115,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
Value dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
- b.getUnitAttr(), 12, 15, false);
+ b.getUnitAttr(), allRows, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
>From 905810167fe218ca4bbc32564ffafb89edb78e76 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Thu, 3 Apr 2025 15:08:59 -0500
Subject: [PATCH 03/23] Rewrites pattern to be closer to device lib impl.
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../mlir/Dialect/GPU/Transforms/Passes.h | 7 ++
.../Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp | 27 ++---
.../GPU/Transforms/SubgroupReduceLowering.cpp | 109 ++++++++++++++++++
3 files changed, 130 insertions(+), 13 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index 5cc65082a7e56..41e0759e958b5 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -62,6 +62,13 @@ void populateGpuLowerSubgroupReduceToShufflePatterns(
RewritePatternSet &patterns, unsigned subgroupSize,
unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);
+/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp`
+/// ops over scalar types. Assumes that the subgroup has
+/// `subgroupSize` lanes. Applicable only to AMD GPUs.
+void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns,
+ unsigned subgroupSize,
+ PatternBenefit benefit = 1);
+
/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns`
/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
index b07ed0a7c636a..590fa7d9b4ffc 100644
--- a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
+++ b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
@@ -67,7 +67,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
Value dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shr, permArg);
+ amdgpu::DPPPerm::row_shl, permArg);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
@@ -76,39 +76,41 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
Value dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shr, permArg);
+ amdgpu::DPPPerm::row_shl, permArg);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
if (ci.clusterSize >= 8) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 4);
- Value dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shr, permArg);
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+ b.getUnitAttr());
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
if (ci.clusterSize >= 16) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 8);
Value dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shr, permArg);
+ amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
const int allRows = 0xf;
const int allBanks = 0xf;
-
+ auto int32Type = IntegerType::get(b.getContext(), 32);
if (ci.clusterSize >= 32) {
auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
Value dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
b.getUnitAttr(), 0xa, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
+ result, dppResult);
+ if (ci.subgroupSize == 32) {
+ Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
+ result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
+ }
}
if (ci.clusterSize == 64) {
@@ -118,11 +120,10 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
b.getUnitAttr(), allRows, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
+ Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+ result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
}
- auto int32Type = IntegerType::get(b.getContext(), 32);
- Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
- result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
assert(result.getType() == input.getType());
return result;
}
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 43eff3eddcc49..f07ef6cf154a9 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -12,6 +12,8 @@
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -362,6 +364,106 @@ struct VectorSubgroupReduceToShuffles final
unsigned shuffleBitwidth = 0;
bool matchClustered = false;
};
+
+Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
+ gpu::AllReduceOperation mode,
+ const ClusterInfo &ci) {
+ Value result = input;
+ if (ci.clusterSize >= 2) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shl, permArg);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 4) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shl, permArg);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 8) {
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+ b.getUnitAttr());
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 16) {
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ const int allRows = 0xf;
+ const int allBanks = 0xf;
+ auto int32Type = IntegerType::get(b.getContext(), 32);
+ if (ci.clusterSize >= 32) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+ b.getUnitAttr(), 0xa, allBanks, false);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ if (ci.subgroupSize == 32) {
+ Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
+ result =
+ b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
+ }
+ }
+
+ if (ci.clusterSize == 64) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
+ b.getUnitAttr(), allRows, allBanks, false);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+ result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
+ }
+
+ assert(result.getType() == input.getType());
+ return result;
+}
+
+struct ScalarSubgroupReduceToDPP final
+ : OpRewritePattern<gpu::SubgroupReduceOp> {
+ ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
+ bool matchClustered, PatternBenefit benefit)
+ : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
+ matchClustered(matchClustered) {}
+
+ LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
+ PatternRewriter &rewriter) const override {
+ if (op.getClusterSize().has_value() != matchClustered) {
+ return rewriter.notifyMatchFailure(
+ op, llvm::formatv("op is {0}clustered but pattern is configured to "
+ "only match {1}clustered ops",
+ matchClustered ? "non-" : "",
+ matchClustered ? "" : "non-"));
+ }
+ auto ci = getAndValidateClusterInfo(op, subgroupSize);
+ if (failed(ci))
+ return failure();
+ Location loc = op.getLoc();
+ rewriter.replaceOp(op, createSubgroupDPPReduction(
+ rewriter, loc, op.getValue(), op.getOp(), *ci));
+ return success();
+ }
+
+private:
+ unsigned subgroupSize = 0;
+ bool matchClustered = false;
+};
} // namespace
void mlir::populateGpuBreakDownSubgroupReducePatterns(
@@ -372,6 +474,13 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns(
patterns.add<ScalarizeSingleElementReduce>(patterns.getContext(), benefit);
}
+void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
+ RewritePatternSet &patterns, unsigned subgroupSize,
+ PatternBenefit benefit) {
+ patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
+ /*matchClustered=*/true, benefit);
+}
+
void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
RewritePatternSet &patterns, unsigned subgroupSize,
unsigned shuffleBitwidth, PatternBenefit benefit) {
>From 8a4267207082f3448ebbe393710bcd5aa24910bd Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Thu, 3 Apr 2025 15:15:36 -0500
Subject: [PATCH 04/23] Removes GPUToAMDGPUPass, moving pattern into existing pass
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h | 32 ---
mlir/include/mlir/Conversion/Passes.h | 1 -
mlir/include/mlir/Conversion/Passes.td | 16 --
mlir/lib/Conversion/CMakeLists.txt | 1 -
.../lib/Conversion/GPUToAMDGPU/CMakeLists.txt | 22 ---
.../Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp | 182 ------------------
mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt | 1 -
7 files changed, 255 deletions(-)
delete mode 100644 mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
delete mode 100644 mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
delete mode 100644 mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
diff --git a/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
deleted file mode 100644
index 2d3bb384235ca..0000000000000
--- a/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//===- GPUToAMDGPU.h - Convert AMDGPU to ROCDL dialect --*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
-#define MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
-
-
-#include "mlir/IR/PatternMatch.h"
-#include <memory>
-#include <string>
-
-namespace mlir {
-
-class LLVMTypeConverter;
-class RewritePatternSet;
-class TypeConverter;
-class Pass;
-
-#define GEN_PASS_DECL_CONVERTGPUTOAMDGPUPASS
-#include "mlir/Conversion/Passes.h.inc"
-
-void populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns,
- unsigned subgroupSize,
- PatternBenefit benefit);
-
-} // namespace mlir
-
-#endif // MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
\ No newline at end of file
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index 1189423799092..ccd862f67c068 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -34,7 +34,6 @@
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
#include "mlir/Conversion/FuncToSPIRV/FuncToSPIRVPass.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
-#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
#include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index b28b4900e6814..bbba495e613b2 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -643,22 +643,6 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> {
];
}
-//===----------------------------------------------------------------------===//
-// GPUToAMDGPU
-//===----------------------------------------------------------------------===//
-
-def ConvertGPUToAMDGPUPass : Pass<"convert-gpu-to-amdgpu"> {
- let summary = "Generate AMDGPU operations for gpu operations";
- let dependentDialects = [
- "amdgpu::AMDGPUDialect",
- "LLVM::LLVMDialect",
- "ROCDL::ROCDLDialect",
- ];
- let options = [Option<"subgroupSize", "subgroup-size", "unsigned",
- /*default=*/"64",
- "Size of subgroup">];
-}
-
//===----------------------------------------------------------------------===//
// ConvertIndexToLLVMPass
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index b957a4473f1e6..b6c21440c571c 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -24,7 +24,6 @@ add_subdirectory(FuncToEmitC)
add_subdirectory(FuncToLLVM)
add_subdirectory(FuncToSPIRV)
add_subdirectory(GPUCommon)
-add_subdirectory(GPUToAMDGPU)
add_subdirectory(GPUToLLVMSPV)
add_subdirectory(GPUToNVVM)
add_subdirectory(GPUToROCDL)
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
deleted file mode 100644
index 9b82b5dc63d9c..0000000000000
--- a/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-add_mlir_conversion_library(MLIRGPUToAMDGPU
- GPUToAMDGPU.cpp
-
- ADDITIONAL_HEADER_DIRS
- ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/GPUToAMDGPU
-
- DEPENDS
- MLIRConversionPassIncGen
-
- LINK_COMPONENTS
- Core
-
- LINK_LIBS PUBLIC
- MLIRLLVMCommonConversion
- MLIRLLVMDialect
- MLIRGPUDialect
- MLIRAMDGPUDialect
- MLIRAMDGPUUtils
- MLIRROCDLDialect
- MLIRPass
- MLIRTransforms
- )
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
deleted file mode 100644
index 590fa7d9b4ffc..0000000000000
--- a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-//===- GPUToAMDGPU.cpp - GPU to AMDGPU dialect conversion -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
-
-#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/TypeUtilities.h"
-#include "mlir/Pass/Pass.h"
-
-#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-
-#include "mlir/Transforms/WalkPatternRewriteDriver.h"
-#include "llvm/Support/FormatVariadic.h"
-
-namespace mlir {
-#define GEN_PASS_DEF_CONVERTGPUTOAMDGPUPASS
-#include "mlir/Conversion/Passes.h.inc"
-} // namespace mlir
-
-using namespace mlir;
-
-namespace {
-struct ClusterInfo {
- unsigned clusterStride;
- unsigned clusterSize;
- unsigned subgroupSize;
-};
-
-static FailureOr<ClusterInfo>
-getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) {
- assert(llvm::isPowerOf2_32(subgroupSize));
-
- std::optional<uint32_t> clusterSize = op.getClusterSize();
- assert(!clusterSize ||
- llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
- if (clusterSize && *clusterSize > subgroupSize)
- return op.emitOpError()
- << "cluster size " << *clusterSize
- << " is greater than subgroup size " << subgroupSize;
- unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
-
- auto clusterStride = op.getClusterStride();
- assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
- if (clusterStride >= subgroupSize)
- return op.emitOpError()
- << "cluster stride " << clusterStride
- << " is not less than subgroup size " << subgroupSize;
-
- return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
-}
-
-Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
- gpu::AllReduceOperation mode,
- const ClusterInfo &ci) {
- Value result = input;
- if (ci.clusterSize >= 2) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
- Value dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- if (ci.clusterSize >= 4) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
- Value dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- if (ci.clusterSize >= 8) {
- Value dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
- b.getUnitAttr());
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- if (ci.clusterSize >= 16) {
- Value dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- const int allRows = 0xf;
- const int allBanks = 0xf;
- auto int32Type = IntegerType::get(b.getContext(), 32);
- if (ci.clusterSize >= 32) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
- Value dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
- b.getUnitAttr(), 0xa, allBanks, false);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- if (ci.subgroupSize == 32) {
- Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
- result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
- }
- }
-
- if (ci.clusterSize == 64) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
- Value dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
- b.getUnitAttr(), allRows, allBanks, false);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
- result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
- }
-
- assert(result.getType() == input.getType());
- return result;
-}
-
-struct ScalarSubgroupReduceToShuffles final
- : OpRewritePattern<gpu::SubgroupReduceOp> {
- ScalarSubgroupReduceToShuffles(MLIRContext *ctx, unsigned subgroupSize,
- bool matchClustered, PatternBenefit benefit)
- : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
- matchClustered(matchClustered) {}
-
- LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
- PatternRewriter &rewriter) const override {
- if (op.getClusterSize().has_value() != matchClustered) {
- return rewriter.notifyMatchFailure(
- op, llvm::formatv("op is {0}clustered but pattern is configured to "
- "only match {1}clustered ops",
- matchClustered ? "non-" : "",
- matchClustered ? "" : "non-"));
- }
-
- auto ci = getAndValidateClusterInfo(op, subgroupSize);
- if (failed(ci))
- return failure();
-
- Location loc = op.getLoc();
- rewriter.replaceOp(op, createSubgroupDPPReduction(
- rewriter, loc, op.getValue(), op.getOp(), *ci));
- return success();
- }
-
-private:
- unsigned subgroupSize = 0;
- bool matchClustered = false;
-};
-
-struct ConvertGPUToAMDGPUPass
- : public impl::ConvertGPUToAMDGPUPassBase<ConvertGPUToAMDGPUPass> {
- using Base::Base;
-
- void runOnOperation() override {
- RewritePatternSet patterns(&getContext());
- int subgroupSizeInt = static_cast<int>(subgroupSize);
- populateAMDGPUOptimizedSubgroupReducePatterns(patterns, subgroupSizeInt,
- PatternBenefit(1));
- walkAndApplyPatterns(getOperation(), std::move(patterns));
- }
-};
-} // namespace
-
-void mlir::populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns,
- unsigned subgroupSize,
- PatternBenefit benefit) {
- patterns.add<ScalarSubgroupReduceToShuffles>(
- patterns.getContext(), subgroupSize, /*matchClustered=*/true, benefit);
-}
diff --git a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
index 52484ac69a3e2..945e3ccdfa87b 100644
--- a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
@@ -15,7 +15,6 @@ add_mlir_conversion_library(MLIRGPUToROCDLTransforms
MLIRMathToLLVM
MLIRMathToROCDL
MLIRAMDGPUToROCDL
- MLIRGPUToAMDGPU
MLIRFuncToLLVM
MLIRGPUDialect
MLIRGPUToGPURuntimeTransforms
>From ba4afaf5c271641e68d3e199465359dbbde03dfc Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Thu, 10 Apr 2025 14:06:51 -0500
Subject: [PATCH 05/23] Adding permlanex16 and other dpp related ops to mlir
dialect
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 4 +++-
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 16 ++++++++++++++
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 6 ++++++
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 14 +++++++++++++
.../GPU/Transforms/SubgroupReduceLowering.cpp | 21 +++++++++++--------
mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir | 8 +++++++
6 files changed, 59 insertions(+), 10 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 108d7237ff703..17c1162170073 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -524,7 +524,8 @@ def AMDGPU_DPPPerm : I32EnumAttr<"DPPPerm",
I32EnumAttrCase<"row_mirror", 8>,
I32EnumAttrCase<"row_half_mirror", 9>,
I32EnumAttrCase<"row_bcast_15", 10>,
- I32EnumAttrCase<"row_bcast_31", 11>
+ I32EnumAttrCase<"row_bcast_31", 11>,
+ I32EnumAttrCase<"row_share", 12>
]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::amdgpu";
@@ -557,6 +558,7 @@ def AMDGPU_DPPOp : AMDGPU_Op<"dpp", [SameTypeOperands, AllTypesMatch<["result",
- Reverse within a half-row (`row_half_mirror`)
- Broadcast the 15th lane of each row to the next row (`row_bcast`)
- Broadcast lane 31 to rows 2 and 3 (`row_bcast`)
+ - Broadcast a lane [0-15] within each row to all lanes of that row (`row_share`)
}];
let results = (outs AnyType:$result);
let assemblyFormat = [{
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 900155c274b4d..8dcca4903f214 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -668,6 +668,22 @@ def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0],
}];
}
+// PermLaneX16 intrinsic operation
+def ROCDL_PermlaneX16Op : ROCDL_IntrOp<"permlanex16", [], [0],
+ [AllTypesMatch<["res", "old", "src0", "src1", "src2"]>], 1, 0, 0,
+ [4, 5], ["fi", "boundControl"]>,
+ Arguments<(ins LLVM_Type:$old, LLVM_Type:$src0, LLVM_Type:$src1, LLVM_Type:$src2,
+ I1Attr:$fi, I1Attr:$boundControl)> {
+ let results = (outs LLVM_Type:$res);
+ let assemblyFormat = [{
+ attr-dict $old `,` $src0 `,` $src1 `,` $src2 `,` $fi `,` $boundControl `:` type($src0)
+ }];
+ let description = [{
+ Performs a `permlanex16` operation, exchanging data between the two
+ 16-lane halves of a 32-lane group according to the lane-select masks
+ in $src1 and $src2; $fi (fetch-inactive) and $boundControl adjust how
+ inactive lanes are handled.
+ }];
+}
+
def ROCDL_V2I16Type : FixedVectorOfLengthAndType<[2], [I16]>,
BuildableType<"::mlir::VectorType::get("
"{2},$_builder.getI16Type())">;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 5f697bdeef566..4d343c8f3200c 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1293,6 +1293,7 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
ROW_HALF_MIRROR = 0x141,
BCAST15 = 0x142,
BCAST31 = 0x143,
+ ROW_SHARE0 = 0x150
};
auto kind = DppOp.getKind();
@@ -1350,6 +1351,11 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
case DPPPerm::row_bcast_31:
DppCtrl = DppCtrl::BCAST31;
break;
+ case DPPPerm::row_share:
+ if (auto intAttr = cast<IntegerAttr>(*permArgument)) {
+ DppCtrl = intAttr.getInt() + DppCtrl::ROW_SHARE0;
+ }
+ break;
}
// Check for row_mask, bank_mask, bound_ctrl if they exist and create
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 549a4376a4a04..af4438f028542 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -468,6 +468,20 @@ LogicalResult DPPOp::verify() {
}
break;
}
+
+ case DPPPerm::row_share: {
+ if (!permArgument) {
+ return emitOpError("Attribute '" + Twine(stringifyDPPPerm(kind)) +
+ "' value not specified");
+ }
+ if (auto intAttr = dyn_cast<IntegerAttr>(permArgument)) {
+ uint32_t attrValue = intAttr.getInt();
+ if (attrValue < 0 || attrValue > 15) {
+ return emitOpError(
+ "Attribute value for 'row_share' must be between 0 and 15");
+ }
+ }
+ } break;
}
return success();
}
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index f07ef6cf154a9..3e64681ad2dd2 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -370,7 +370,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
const ClusterInfo &ci) {
Value result = input;
if (ci.clusterSize >= 2) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
+ auto permArg = b.getI32IntegerAttr(1);
Value dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
amdgpu::DPPPerm::row_shl, permArg);
@@ -379,7 +379,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
}
if (ci.clusterSize >= 4) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
+ auto permArg = b.getI32IntegerAttr(2);
Value dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
amdgpu::DPPPerm::row_shl, permArg);
@@ -405,16 +405,19 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
const int allRows = 0xf;
const int allBanks = 0xf;
- auto int32Type = IntegerType::get(b.getContext(), 32);
+ auto uint32Type = b.getIntegerType(32, false);
if (ci.clusterSize >= 32) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
- Value dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
- b.getUnitAttr(), 0xa, allBanks, false);
+ // auto permArg = b.getI32IntegerAttr(15);
+ // Value dppResult = b.create<amdgpu::DPPOp>(
+ // loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+ // b.getUnitAttr(), 0xa, allBanks, false);
+ auto uIntMax = llvm::APInt::getMaxValue(32u);
+ Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, uint32Type, uIntMax);
+ Value dppResult = b.create<ROCDL::PermlaneX16Op>(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
if (ci.subgroupSize == 32) {
- Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
+ Value lane01 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 1);
result =
b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
}
@@ -427,7 +430,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
b.getUnitAttr(), allRows, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
- Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+ Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
index 14691e73e62d7..64b3328b70ab4 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
@@ -137,3 +137,11 @@ func.func @row_bcast_update_dpp_f16(%arg0: f16, %arg1: f16) -> f16 {
%0 = amdgpu.dpp %arg0 %arg1 row_bcast_15 { bound_ctrl = true } : f16
return %0 : f16
}
+
+func.func @dpp_row_share(%arg0: i32, %arg1: i32) -> i32 {
+ // CHECK-LABEL: func @dpp_row_share
+ // CHECK: rocdl.update.dpp %arg0, %arg1 with 351, 15, 15, false : i32
+ // CHECK: return %0 : i32
+ %0 = amdgpu.dpp %arg0 %arg1 row_share ( 0xf : i32 ) : i32
+ return %0 : i32
+}
>From 1f6fcddc75b48f92a801ea6f02a8ac21fd9ef40a Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Fri, 11 Apr 2025 10:30:10 -0500
Subject: [PATCH 06/23] Fixing permlanex16 intrinsic failure
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 4 ++--
mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 7 +------
2 files changed, 3 insertions(+), 8 deletions(-)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 8dcca4903f214..186a4f53f93cb 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -670,13 +670,13 @@ def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0],
// PermLaneX16 intrinsic operation
def ROCDL_PermlaneX16Op : ROCDL_IntrOp<"permlanex16", [], [0],
- [AllTypesMatch<["res", "old", "src0", "src1", "src2"]>], 1, 0, 0,
+ [AllTypesMatch<["res", "old", "src0"]>, AllTypesMatch<["src1", "src2"]>], 1, 0, 0,
[4, 5], ["fi", "boundControl"]>,
Arguments<(ins LLVM_Type:$old, LLVM_Type:$src0, LLVM_Type:$src1, LLVM_Type:$src2,
I1Attr:$fi, I1Attr:$boundControl)> {
let results = (outs LLVM_Type:$res);
let assemblyFormat = [{
- attr-dict $old `,` $src0 `,` $src1 `,` $src2 `,` $fi `,` $boundControl `:` type($src0)
+ attr-dict $old `,` $src0 `,` $src1 `,` $src2 `,` $fi `,` $boundControl `:` type($src0) `,` type($src1)
}];
let description = [{
Performs a `permlanex16` operation with the given operands, applying the
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 3e64681ad2dd2..b6bd67fa0ce53 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -405,14 +405,9 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
const int allRows = 0xf;
const int allBanks = 0xf;
- auto uint32Type = b.getIntegerType(32, false);
if (ci.clusterSize >= 32) {
- // auto permArg = b.getI32IntegerAttr(15);
- // Value dppResult = b.create<amdgpu::DPPOp>(
- // loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
- // b.getUnitAttr(), 0xa, allBanks, false);
auto uIntMax = llvm::APInt::getMaxValue(32u);
- Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, uint32Type, uIntMax);
+ Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), uIntMax);
Value dppResult = b.create<ROCDL::PermlaneX16Op>(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
>From a0aaa97f1c1c2a396d586662a5fa5c2623aa735b Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Fri, 11 Apr 2025 11:27:53 -0500
Subject: [PATCH 07/23] simplify verbose typing
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index b6bd67fa0ce53..b9eae59584e94 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -419,7 +419,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
}
if (ci.clusterSize == 64) {
- auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
+ auto permArg = b.getI32IntegerAttr(31);
Value dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
b.getUnitAttr(), allRows, allBanks, false);
>From 4cdd676ea8a1e8f0cadf901b70bfb6f8a59d64df Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Fri, 11 Apr 2025 22:13:11 -0500
Subject: [PATCH 08/23] testing numerics
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../GPU/Transforms/SubgroupReduceLowering.cpp | 59 ++++++++++++-------
1 file changed, 38 insertions(+), 21 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index b9eae59584e94..0790edc15921e 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -369,46 +369,63 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
gpu::AllReduceOperation mode,
const ClusterInfo &ci) {
Value result = input;
+ Value dppResult;
+ const int allRows = 0xf;
+ const int allBanks = 0xf;
+ const bool boundCtrl = true;
if (ci.clusterSize >= 2) {
auto permArg = b.getI32IntegerAttr(1);
- Value dppResult =
+ dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg);
+ amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
if (ci.clusterSize >= 4) {
auto permArg = b.getI32IntegerAttr(2);
- Value dppResult =
+ dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg);
+ amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
- if (ci.clusterSize >= 8) {
- Value dppResult = b.create<amdgpu::DPPOp>(
+ if (ci.clusterSize <= 8) {
+ dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
- b.getUnitAttr());
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
+ b.getUnitAttr(), allRows, allBanks, boundCtrl);
+ } else if (ci.clusterSize == 8) {
+ auto permArg = b.getI32IntegerAttr(4);
+ dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
}
-
- if (ci.clusterSize >= 16) {
- Value dppResult =
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+
+ if (ci.clusterSize <= 16) {
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror,
+ b.getUnitAttr(), allRows, allBanks, boundCtrl);
+ } else if (ci.clusterSize == 16) {
+ auto permArg = b.getI32IntegerAttr(8);
+ dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
+ amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
}
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
- const int allRows = 0xf;
- const int allBanks = 0xf;
if (ci.clusterSize >= 32) {
- auto uIntMax = llvm::APInt::getMaxValue(32u);
- Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), uIntMax);
- Value dppResult = b.create<ROCDL::PermlaneX16Op>(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false);
+ auto permArg = b.getI32IntegerAttr(15);
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+ b.getUnitAttr(), 0xa, allBanks, false);
+ // if (chipset.majorVersion == 9)
+ // auto uIntMax = llvm::APInt::getMaxValue(32u);
+ // Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), uIntMax);
+ // Value dppResult = b.create<ROCDL::PermlaneX16Op>(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
if (ci.subgroupSize == 32) {
@@ -420,7 +437,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
if (ci.clusterSize == 64) {
auto permArg = b.getI32IntegerAttr(31);
- Value dppResult = b.create<amdgpu::DPPOp>(
+ dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
b.getUnitAttr(), allRows, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
>From fe78aa3bf08e747f06d75c75c715d34aa7ba01fe Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Fri, 11 Apr 2025 22:44:39 -0500
Subject: [PATCH 09/23] fixing
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../GPU/Transforms/SubgroupReduceLowering.cpp | 32 +++++++++++--------
1 file changed, 18 insertions(+), 14 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 0790edc15921e..b47553e41c501 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -391,31 +391,35 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result, dppResult);
}
- if (ci.clusterSize <= 8) {
+ if (ci.clusterSize == 8) {
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
b.getUnitAttr(), allRows, allBanks, boundCtrl);
- } else if (ci.clusterSize == 8) {
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ } else if (ci.clusterSize >= 8) {
auto permArg = b.getI32IntegerAttr(4);
- dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
+ dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shl, permArg,
+ allRows, allBanks, boundCtrl);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
}
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- if (ci.clusterSize <= 16) {
+ if (ci.clusterSize == 16) {
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror,
b.getUnitAttr(), allRows, allBanks, boundCtrl);
- } else if (ci.clusterSize == 16) {
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ } else if (ci.clusterSize >= 16) {
auto permArg = b.getI32IntegerAttr(8);
- dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
- }
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shl, permArg,
+ allRows, allBanks, boundCtrl);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
+ }
if (ci.clusterSize >= 32) {
auto permArg = b.getI32IntegerAttr(15);
>From 3e3500d7706dec67503b90c9f6e060d5ccb54e75 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Fri, 11 Apr 2025 23:16:43 -0500
Subject: [PATCH 10/23] fixing
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index b47553e41c501..889c378ab0a9f 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -420,7 +420,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
-
+ Value lane01 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 1);
if (ci.clusterSize >= 32) {
auto permArg = b.getI32IntegerAttr(15);
dppResult = b.create<amdgpu::DPPOp>(
@@ -433,7 +433,6 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
if (ci.subgroupSize == 32) {
- Value lane01 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 1);
result =
b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
}
@@ -446,8 +445,8 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
b.getUnitAttr(), allRows, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
- Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
- result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
+ // Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
+ result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
}
assert(result.getType() == input.getType());
>From 761965775b1253fca6f44240f5403b35d89f5415 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Fri, 11 Apr 2025 23:32:43 -0500
Subject: [PATCH 11/23] fixing
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 889c378ab0a9f..d774197dc6d15 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -420,7 +420,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
- Value lane01 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 1);
+ Value lane00 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 0);
if (ci.clusterSize >= 32) {
auto permArg = b.getI32IntegerAttr(15);
dppResult = b.create<amdgpu::DPPOp>(
@@ -434,7 +434,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result, dppResult);
if (ci.subgroupSize == 32) {
result =
- b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
+ b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane00);
}
}
@@ -446,7 +446,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
// Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
- result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
+ result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane00);
}
assert(result.getType() == input.getType());
>From f9db46735959c454e069697153fcdc26aa35d862 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Sun, 13 Apr 2025 22:17:55 -0500
Subject: [PATCH 12/23] trying again
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../GPU/Transforms/SubgroupReduceLowering.cpp | 67 ++++++++++++-------
1 file changed, 42 insertions(+), 25 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index d774197dc6d15..8dd637b28d4ae 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -374,53 +374,71 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
const int allBanks = 0xf;
const bool boundCtrl = true;
if (ci.clusterSize >= 2) {
- auto permArg = b.getI32IntegerAttr(1);
+ // auto permArg = b.getI32IntegerAttr(1);
+ auto permArg = b.getI32ArrayAttr({1, 0, 3, 2});
dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
+ amdgpu::DPPPerm::quad_perm, permArg, allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
if (ci.clusterSize >= 4) {
- auto permArg = b.getI32IntegerAttr(2);
+ // auto permArg = b.getI32IntegerAttr(2);
+ auto permArg = b.getI32ArrayAttr({2, 3, 0, 1});
dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl);
+ amdgpu::DPPPerm::quad_perm, permArg, allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
- if (ci.clusterSize == 8) {
+ // if (ci.clusterSize == 8) {
+ // dppResult = b.create<amdgpu::DPPOp>(
+ // loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+ // b.getUnitAttr(), allRows, allBanks, boundCtrl);
+ // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ // result, dppResult);
+ // } else if (ci.clusterSize >= 8) {
+ // auto permArg = b.getI32IntegerAttr(4);
+ // dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ // amdgpu::DPPPerm::row_shr, permArg,
+ // allRows, allBanks, boundCtrl);
+ // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ // result, dppResult);
+ // }
+ if (ci.clusterSize >= 8) {
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
b.getUnitAttr(), allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
- } else if (ci.clusterSize >= 8) {
- auto permArg = b.getI32IntegerAttr(4);
- dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg,
- allRows, allBanks, boundCtrl);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
}
- if (ci.clusterSize == 16) {
+ // if (ci.clusterSize == 16) {
+ // dppResult = b.create<amdgpu::DPPOp>(
+ // loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror,
+ // b.getUnitAttr(), allRows, allBanks, boundCtrl);
+ // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ // result, dppResult);
+ // } else if (ci.clusterSize >= 16) {
+ // auto permArg = b.getI32IntegerAttr(8);
+ // dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ // amdgpu::DPPPerm::row_shr, permArg,
+ // allRows, allBanks, boundCtrl);
+ // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ // result, dppResult);
+ // }
+ if (ci.clusterSize >= 16) {
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror,
b.getUnitAttr(), allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
- } else if (ci.clusterSize >= 16) {
- auto permArg = b.getI32IntegerAttr(8);
- dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_shl, permArg,
- allRows, allBanks, boundCtrl);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
}
- Value lane00 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 0);
+
+ Value lane31 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 31);
+ Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
if (ci.clusterSize >= 32) {
auto permArg = b.getI32IntegerAttr(15);
dppResult = b.create<amdgpu::DPPOp>(
@@ -434,7 +452,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result, dppResult);
if (ci.subgroupSize == 32) {
result =
- b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane00);
+ b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane31);
}
}
@@ -442,11 +460,10 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
auto permArg = b.getI32IntegerAttr(31);
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
- b.getUnitAttr(), allRows, allBanks, false);
+ b.getUnitAttr(), 0xc, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
- // Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
- result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane00);
+ result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
}
assert(result.getType() == input.getType());
>From f9978e002e76ce682e771438b24949782e64f2ae Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Mon, 14 Apr 2025 00:59:52 -0500
Subject: [PATCH 13/23] Fixing implementation
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../GPU/Transforms/SubgroupReduceLowering.cpp | 72 +++++++------------
1 file changed, 26 insertions(+), 46 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 8dd637b28d4ae..0c923828093b9 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -10,6 +10,7 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
@@ -367,14 +368,14 @@ struct VectorSubgroupReduceToShuffles final
Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
gpu::AllReduceOperation mode,
- const ClusterInfo &ci) {
+ const ClusterInfo &ci,
+ amdgpu::Chipset chipset) {
Value result = input;
Value dppResult;
const int allRows = 0xf;
const int allBanks = 0xf;
const bool boundCtrl = true;
if (ci.clusterSize >= 2) {
- // auto permArg = b.getI32IntegerAttr(1);
auto permArg = b.getI32ArrayAttr({1, 0, 3, 2});
dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
@@ -384,7 +385,6 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
}
if (ci.clusterSize >= 4) {
- // auto permArg = b.getI32IntegerAttr(2);
auto permArg = b.getI32ArrayAttr({2, 3, 0, 1});
dppResult =
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
@@ -393,20 +393,6 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result, dppResult);
}
- // if (ci.clusterSize == 8) {
- // dppResult = b.create<amdgpu::DPPOp>(
- // loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
- // b.getUnitAttr(), allRows, allBanks, boundCtrl);
- // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- // result, dppResult);
- // } else if (ci.clusterSize >= 8) {
- // auto permArg = b.getI32IntegerAttr(4);
- // dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- // amdgpu::DPPPerm::row_shr, permArg,
- // allRows, allBanks, boundCtrl);
- // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- // result, dppResult);
- // }
if (ci.clusterSize >= 8) {
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
@@ -415,20 +401,6 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result, dppResult);
}
- // if (ci.clusterSize == 16) {
- // dppResult = b.create<amdgpu::DPPOp>(
- // loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror,
- // b.getUnitAttr(), allRows, allBanks, boundCtrl);
- // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- // result, dppResult);
- // } else if (ci.clusterSize >= 16) {
- // auto permArg = b.getI32IntegerAttr(8);
- // dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- // amdgpu::DPPPerm::row_shr, permArg,
- // allRows, allBanks, boundCtrl);
- // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- // result, dppResult);
- // }
if (ci.clusterSize >= 16) {
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror,
@@ -440,14 +412,19 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
Value lane31 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 31);
Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
if (ci.clusterSize >= 32) {
- auto permArg = b.getI32IntegerAttr(15);
- dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
- b.getUnitAttr(), 0xa, allBanks, false);
- // if (chipset.majorVersion == 9)
- // auto uIntMax = llvm::APInt::getMaxValue(32u);
- // Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), uIntMax);
- // Value dppResult = b.create<ROCDL::PermlaneX16Op>(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false);
+ if (chipset.majorVersion <= 9) {
+ auto permArg = b.getI32IntegerAttr(15);
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+ b.getUnitAttr(), 0xa, allBanks, false);
+ } else if (chipset.majorVersion == 10) {
+ auto uIntMax = llvm::APInt::getMaxValue(32u);
+ Value uIntMaxConst =
+ b.create<LLVM::ConstantOp>(loc, b.getI32Type(), uIntMax);
+ Value dppResult = b.create<ROCDL::PermlaneX16Op>(
+ loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst,
+ true, false);
+ }
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
if (ci.subgroupSize == 32) {
@@ -458,9 +435,9 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
if (ci.clusterSize == 64) {
auto permArg = b.getI32IntegerAttr(31);
- dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
- b.getUnitAttr(), 0xc, allBanks, false);
+ dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_bcast_31,
+ b.getUnitAttr(), 0xc, allBanks, false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
@@ -473,9 +450,10 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
struct ScalarSubgroupReduceToDPP final
: OpRewritePattern<gpu::SubgroupReduceOp> {
ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
- bool matchClustered, PatternBenefit benefit)
+ bool matchClustered, Chipset chipset,
+ PatternBenefit benefit)
: OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
- matchClustered(matchClustered) {}
+ matchClustered(matchClustered), chipset(chipset) {}
LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
PatternRewriter &rewriter) const override {
@@ -498,6 +476,7 @@ struct ScalarSubgroupReduceToDPP final
private:
unsigned subgroupSize = 0;
bool matchClustered = false;
+ Chipset chipset;
};
} // namespace
@@ -510,10 +489,11 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns(
}
void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
- RewritePatternSet &patterns, unsigned subgroupSize,
+ RewritePatternSet &patterns, unsigned subgroupSize, Chipset chipset,
PatternBenefit benefit) {
patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
- /*matchClustered=*/true, benefit);
+ /*matchClustered=*/true, chipset,
+ benefit);
}
void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
>From 70786ead39d7d7f60d9f22357b3cda1f2e4083ba Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Mon, 14 Apr 2025 16:11:33 -0500
Subject: [PATCH 14/23] Adding DPP test
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../mlir/Dialect/GPU/Transforms/Passes.h | 2 +
.../GPU/Transforms/SubgroupReduceLowering.cpp | 57 +++++++++++--------
.../Dialect/GPU/subgroup-reduce-lowering.mlir | 33 +++++++++++
mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 24 ++++++--
4 files changed, 89 insertions(+), 27 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index 41e0759e958b5..5b185e262deb0 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -13,6 +13,7 @@
#ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
#define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/IR/PatternMatch.h"
@@ -67,6 +68,7 @@ void populateGpuLowerSubgroupReduceToShufflePatterns(
/// `subgroupSize` lanes. Applicable only to AMD GPUs.
void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns,
unsigned subgroupSize,
+ amdgpu::Chipset chipset,
PatternBenefit benefit = 1);
/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns`
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 0c923828093b9..a327730851ed4 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -26,6 +26,7 @@
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
+#include <llvm-14/llvm/Support/ErrorHandling.h>
using namespace mlir;
@@ -370,25 +371,27 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
gpu::AllReduceOperation mode,
const ClusterInfo &ci,
amdgpu::Chipset chipset) {
- Value result = input;
Value dppResult;
+ Value result = input;
const int allRows = 0xf;
const int allBanks = 0xf;
const bool boundCtrl = true;
+ Value lane31 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 31);
+ Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
if (ci.clusterSize >= 2) {
auto permArg = b.getI32ArrayAttr({1, 0, 3, 2});
- dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::quad_perm, permArg, allRows, allBanks, boundCtrl);
+ dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::quad_perm, permArg,
+ allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
if (ci.clusterSize >= 4) {
auto permArg = b.getI32ArrayAttr({2, 3, 0, 1});
- dppResult =
- b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::quad_perm, permArg, allRows, allBanks, boundCtrl);
+ dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::quad_perm, permArg,
+ allRows, allBanks, boundCtrl);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
}
@@ -409,19 +412,15 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
result, dppResult);
}
- Value lane31 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 31);
- Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
if (ci.clusterSize >= 32) {
if (chipset.majorVersion <= 9) {
- auto permArg = b.getI32IntegerAttr(15);
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
- b.getUnitAttr(), 0xa, allBanks, false);
+ b.getUnitAttr(), 0xa, allBanks, /*bound_ctrl*/ false);
} else if (chipset.majorVersion == 10) {
- auto uIntMax = llvm::APInt::getMaxValue(32u);
Value uIntMaxConst =
- b.create<LLVM::ConstantOp>(loc, b.getI32Type(), uIntMax);
- Value dppResult = b.create<ROCDL::PermlaneX16Op>(
+ b.create<LLVM::ConstantOp>(loc, b.getI32Type(), -1);
+ dppResult = b.create<ROCDL::PermlaneX16Op>(
loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst,
true, false);
}
@@ -434,10 +433,9 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
}
if (ci.clusterSize == 64) {
- auto permArg = b.getI32IntegerAttr(31);
- dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::row_bcast_31,
- b.getUnitAttr(), 0xc, allBanks, false);
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
+ b.getUnitAttr(), 0xc, allBanks, /*bound_ctrl*/ false);
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
@@ -447,10 +445,13 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
return result;
}
+/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp`
+/// ops over scalar types. Assumes that the subgroup has
+/// `subgroupSize` lanes. Applicable only to AMD GPUs.
struct ScalarSubgroupReduceToDPP final
: OpRewritePattern<gpu::SubgroupReduceOp> {
ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
- bool matchClustered, Chipset chipset,
+ bool matchClustered, amdgpu::Chipset chipset,
PatternBenefit benefit)
: OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
matchClustered(matchClustered), chipset(chipset) {}
@@ -467,16 +468,26 @@ struct ScalarSubgroupReduceToDPP final
auto ci = getAndValidateClusterInfo(op, subgroupSize);
if (failed(ci))
return failure();
+
+ if (ci->clusterStride != 1)
+ return failure();
+
+ Type valueTy = op.getType();
+ if (!valueTy.isIntOrFloat())
+ return rewriter.notifyMatchFailure(
+ op, "value type is not a compatible scalar");
+
Location loc = op.getLoc();
- rewriter.replaceOp(op, createSubgroupDPPReduction(
- rewriter, loc, op.getValue(), op.getOp(), *ci));
+ rewriter.replaceOp(op,
+ createSubgroupDPPReduction(rewriter, loc, op.getValue(),
+ op.getOp(), *ci, chipset));
return success();
}
private:
unsigned subgroupSize = 0;
bool matchClustered = false;
- Chipset chipset;
+ amdgpu::Chipset chipset;
};
} // namespace
@@ -489,7 +500,7 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns(
}
void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
- RewritePatternSet &patterns, unsigned subgroupSize, Chipset chipset,
+ RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
PatternBenefit benefit) {
patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
/*matchClustered=*/true, chipset,
diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
index 9f2aa1be52fc3..8ac1a5561aad6 100644
--- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
+++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
@@ -6,14 +6,20 @@
// RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles" %s \
// RUN: | FileCheck %s --check-prefix=CHECK-SHFL
+// RUN: mlir-opt --allow-unregistered-dialect \
+// RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx942" %s \
+// RUN: | FileCheck %s --check-prefix=CHECK-DPP
+
// CHECK-SUB: gpu.module @kernels {
// CHECK-SHFL: gpu.module @kernels {
+// CHECK-DPP: gpu.module @kernels {
gpu.module @kernels {
// CHECK-SUB-LABEL: gpu.func @kernel0(
// CHECK-SUB-SAME: %[[ARG0:.+]]: vector<5xf16>)
//
// CHECK-SHFL-LABEL: gpu.func @kernel0(
+ // CHECK-DPP-LABEL: gpu.func @kernel0(
gpu.func @kernel0(%arg0: vector<5xf16>) kernel {
// CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16>
// CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16>
@@ -36,6 +42,7 @@ gpu.module @kernels {
// CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4)
// CHECK-SUB: "test.consume"
+ // CHECK-DPP-COUNT-2: amdgpu.dpp {{.+}}
%sum2 = gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>)
"test.consume"(%sum2) : (vector<5xf16>) -> ()
@@ -52,6 +59,8 @@ gpu.module @kernels {
// CHECK-SUB-SAME: %[[ARG0:.+]]: vector<1xf32>)
//
// CHECK-SHFL-LABEL: gpu.func @kernel1(
+ //
+ // CHECK-DPP-LABEL: gpu.func @kernel1(
gpu.func @kernel1(%arg0: vector<1xf32>) kernel {
// CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32>
// CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32
@@ -68,6 +77,8 @@ gpu.module @kernels {
// Note stride is dropped because it is == 1.
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32
// CHECK-SUB: "test.consume"
+ // CHECK-DPP-COUNT-2: amdgpu.dpp {{.+}} quad_perm
+ // CHECK-DPP: amdgpu.dpp {{.+}} row_half_mirror
%sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>)
"test.consume"(%sum2) : (vector<1xf32>) -> ()
@@ -131,6 +142,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel3_clustered(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
+ // CHECK-DPP-LABEL: gpu.func @kernel3_clustered(
+ // CHECK-DPP-SAME: %[[ARG0:.+]]: i32)
gpu.func @kernel3_clustered(%arg0: i32) kernel {
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -144,6 +157,14 @@ gpu.module @kernels {
// CHECK-SHFL: %[[S2:.+]], %{{.+}} = gpu.shuffle xor %[[A1]], %[[C4]], %[[C32]] : i32
// CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32
// CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> ()
+
+ // CHECK-DPP: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
+ // CHECK-DPP: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
+ // CHECK-DPP: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
+ // CHECK-DPP: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
+ // CHECK-DPP: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
+ // CHECK-DPP: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
+ // CHECK-DPP: "test.consume"(%[[A2]]) : (i32) -> ()
%sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32
"test.consume"(%sum0) : (i32) -> ()
@@ -246,6 +267,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel5_clustered(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
+ // CHECK-DPP-LABEL: gpu.func @kernel5_clustered
+ // CHECK-DPP-SAME: %[[ARG0:.+]]: i16)
gpu.func @kernel5_clustered(%arg0: i16) kernel {
// CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
// CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -257,6 +280,16 @@ gpu.module @kernels {
// CHECK-SHFL: arith.trunci {{.+}} : i32 to i16
// CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
// CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
+
+ // CHECK-DPPL: %[[VAR0:.+]] =amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
+ // CHECK-DPPL: %[[VAR1:.+]] =arith.addi %[[ARG0]], %[[VAR0]] : i16
+ // CHECK-DPPL: %[[VAR2:.+]] =amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
+ // CHECK-DPPL: %[[VAR3:.+]] =arith.addi %[[VAR1]], %[[VAR2]] : i16
+ // CHECK-DPPL: %[[VAR4:.+]] =amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
+ // CHECK-DPPL: %[[VAR5:.+]] =arith.addi %[[VAR3]], %[[VAR4]] : i16
+ // CHECK-DPPL: %[[VAR6:.+]] =amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
+ // CHECK-DPPL: %[[VAR7:.+]] =arith.addi %[[VAR5]], %[[VAR6]] : i16
+ // CHECK-DPPL: "test.consume"(%[[VAR7]]) : (i16) -> ()
%sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16
"test.consume"(%sum0) : (i16) -> ()
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index a49d304baf5c6..7515e9050240d 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -10,10 +10,13 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/PatternMatch.h"
@@ -28,8 +31,9 @@ struct TestGpuRewritePass
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestGpuRewritePass)
void getDependentDialects(DialectRegistry ®istry) const override {
- registry.insert<arith::ArithDialect, func::FuncDialect, index::IndexDialect,
- memref::MemRefDialect>();
+ registry.insert<amdgpu::AMDGPUDialect, arith::ArithDialect,
+ func::FuncDialect, index::IndexDialect,
+ memref::MemRefDialect, ROCDL::ROCDLDialect>();
}
StringRef getArgument() const final { return "test-gpu-rewrite"; }
StringRef getDescription() const final {
@@ -54,7 +58,8 @@ struct TestGpuSubgroupReduceLoweringPass
: PassWrapper(pass) {}
void getDependentDialects(DialectRegistry ®istry) const override {
- registry.insert<arith::ArithDialect, vector::VectorDialect>();
+ registry.insert<amdgpu::AMDGPUDialect, arith::ArithDialect, LLVM::LLVMDialect,
+ ROCDL::ROCDLDialect, vector::VectorDialect>();
}
StringRef getArgument() const final {
@@ -70,6 +75,12 @@ struct TestGpuSubgroupReduceLoweringPass
llvm::cl::desc("Expand subgroup_reduce ops to shuffle ops."),
llvm::cl::init(false)};
+ Option<std::string> target{
+ *this, "target",
+ llvm::cl::desc("Target backend name which will be used to provide "
+ "compatible lowerings of subgroup reduce."),
+ llvm::cl::init("")};
+
void runOnOperation() override {
RewritePatternSet patterns(&getContext());
@@ -77,8 +88,13 @@ struct TestGpuSubgroupReduceLoweringPass
// perform fewer failing matches.
populateGpuBreakDownSubgroupReducePatterns(patterns,
/*maxShuffleBitwidth=*/32,
- PatternBenefit(2));
+ PatternBenefit(3));
if (expandToShuffles) {
+ auto maybeChipset = amdgpu::Chipset::parse(target);
+ if (!failed(maybeChipset)) {
+ populateGpuLowerSubgroupReduceToDPPPatterns(
+ patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2));
+ }
populateGpuLowerSubgroupReduceToShufflePatterns(
patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32);
populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
>From 4db2e38aa78a740bb006201a9fda47736ebc4fe8 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Mon, 14 Apr 2025 16:28:48 -0500
Subject: [PATCH 15/23] Addressing PR comments
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../GPU/Transforms/SubgroupReduceLowering.cpp | 12 +++++++-----
mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 2 +-
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index a327730851ed4..a01b182501f36 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -376,8 +376,10 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
const int allRows = 0xf;
const int allBanks = 0xf;
const bool boundCtrl = true;
- Value lane31 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 31);
- Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
+ Value lane31 =
+ b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(31));
+ Value lane63 =
+ b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(63));
if (ci.clusterSize >= 2) {
auto permArg = b.getI32ArrayAttr({1, 0, 3, 2});
dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
@@ -417,9 +419,9 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
b.getUnitAttr(), 0xa, allBanks, /*bound_ctrl*/ false);
- } else if (chipset.majorVersion == 10) {
- Value uIntMaxConst =
- b.create<LLVM::ConstantOp>(loc, b.getI32Type(), -1);
+ } else if (chipset.majorVersion >= 10) {
+ Value uIntMaxConst = b.create<arith::ConstantOp>(loc, b.getI32Type(),
+ b.getI32IntegerAttr(-1));
dppResult = b.create<ROCDL::PermlaneX16Op>(
loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst,
true, false);
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index 7515e9050240d..97f9e33290f35 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -91,7 +91,7 @@ struct TestGpuSubgroupReduceLoweringPass
PatternBenefit(3));
if (expandToShuffles) {
auto maybeChipset = amdgpu::Chipset::parse(target);
- if (!failed(maybeChipset)) {
+ if (succeeded(maybeChipset)) {
populateGpuLowerSubgroupReduceToDPPPatterns(
patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2));
}
>From 8fe131c9cdcaf95d936b77632ca510ed592a3ada Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Mon, 14 Apr 2025 17:01:12 -0500
Subject: [PATCH 16/23] removing unnecessary header
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index a01b182501f36..b0803ff050391 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -26,7 +26,6 @@
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
-#include <llvm-14/llvm/Support/ErrorHandling.h>
using namespace mlir;
>From bcac689234a66917797a2ec8a8df2511c945274a Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 00:27:29 -0500
Subject: [PATCH 17/23] Addressing PR comments
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../mlir/Dialect/GPU/Transforms/Passes.h | 4 +++
.../GPU/Transforms/SubgroupReduceLowering.cpp | 14 ++++++--
.../Dialect/GPU/subgroup-reduce-lowering.mlir | 34 ++++++++++++++-----
mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 2 ++
4 files changed, 42 insertions(+), 12 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index 5b185e262deb0..f113649e0c908 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -71,6 +71,10 @@ void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns,
amdgpu::Chipset chipset,
PatternBenefit benefit = 1);
+void populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
+ RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
+ PatternBenefit benefit = 1);
+
/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns`
/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index b0803ff050391..0a0dc95b0c0d9 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -372,8 +372,8 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
amdgpu::Chipset chipset) {
Value dppResult;
Value result = input;
- const int allRows = 0xf;
- const int allBanks = 0xf;
+ constexpr int allRows = 0xf;
+ constexpr int allBanks = 0xf;
const bool boundCtrl = true;
Value lane31 =
b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(31));
@@ -504,10 +504,18 @@ void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
PatternBenefit benefit) {
patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
- /*matchClustered=*/true, chipset,
+ /*matchClustered=*/false, chipset,
benefit);
}
+void mlir::populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
+ RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
+ PatternBenefit benefit) {
+patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
+ /*matchClustered=*/true, chipset,
+ benefit);
+}
+
void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
RewritePatternSet &patterns, unsigned subgroupSize,
unsigned shuffleBitwidth, PatternBenefit benefit) {
diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
index 8ac1a5561aad6..018ea835ea38c 100644
--- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
+++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
@@ -97,6 +97,8 @@ gpu.module @kernels {
// CHECK-SUB-SAME: %[[ARG0:.+]]: vector<3xi8>, %[[ARG1:.+]]: vector<4xi8>)
//
// CHECK-SHFL-LABEL: gpu.func @kernel2(
+ // CHECK-DPP-LABEL: gpu.func @kernel2(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel {
// CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8>
// CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> ()
@@ -114,6 +116,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel3(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
+ // CHECK-DPP-LABEL: gpu.func @kernel3(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel3(%arg0: i32) kernel {
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -174,6 +178,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
+ // CHECK-DPP-LABEL: gpu.func @kernel3_clustered_strided(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel3_clustered_strided(%arg0: i32) kernel {
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32
@@ -196,6 +202,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel4(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>)
+ // CHECK-DPP-LABEL: gpu.func @kernel4(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel4(%arg0: vector<2xf16>) kernel {
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -232,6 +240,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel4_clustered(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>)
+ // CHECK-DPP-LABEL: gpu.func @kernel4_clustered(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel4_clustered(%arg0: vector<2xf16>) kernel {
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -247,6 +257,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel5(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
+ // CHECK-DPP-LABEL: gpu.func @kernel5(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel5(%arg0: i16) kernel {
// CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
// CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -281,15 +293,15 @@ gpu.module @kernels {
// CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
// CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
- // CHECK-DPPL: %[[VAR0:.+]] =amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
- // CHECK-DPPL: %[[VAR1:.+]] =arith.addi %[[ARG0]], %[[VAR0]] : i16
- // CHECK-DPPL: %[[VAR2:.+]] =amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
- // CHECK-DPPL: %[[VAR3:.+]] =arith.addi %[[VAR1]], %[[VAR2]] : i16
- // CHECK-DPPL: %[[VAR4:.+]] =amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
- // CHECK-DPPL: %[[VAR5:.+]] =arith.addi %[[VAR3]], %[[VAR4]] : i16
- // CHECK-DPPL: %[[VAR6:.+]] =amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
- // CHECK-DPPL: %[[VAR7:.+]] =arith.addi %[[VAR5]], %[[VAR6]] : i16
- // CHECK-DPPL: "test.consume"(%[[VAR7]]) : (i16) -> ()
+ // CHECK-DPP: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
+ // CHECK-DPP: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
+ // CHECK-DPP: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
+ // CHECK-DPP: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
+ // CHECK-DPP: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
+ // CHECK-DPP: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
+ // CHECK-DPP: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
+ // CHECK-DPP: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
+ // CHECK-DPP: "test.consume"(%[[VAR7]]) : (i16) -> ()
%sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16
"test.consume"(%sum0) : (i16) -> ()
@@ -299,6 +311,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel6(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>)
+ // CHECK-DPP-LABEL: gpu.func @kernel6(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel6(%arg0: vector<3xi8>) kernel {
// CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8>
// CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8>
@@ -322,6 +336,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>)
+ // CHECK-DPP-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
+ // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel_cluster_size_is_subgroup_size(%arg0: vector<3xi8>) kernel {
// CHECK-SHFL-COUNT-5: gpu.shuffle xor
%sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (vector<3xi8>) -> (vector<3xi8>)
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index 97f9e33290f35..f34b882c1be86 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -94,6 +94,8 @@ struct TestGpuSubgroupReduceLoweringPass
if (succeeded(maybeChipset)) {
populateGpuLowerSubgroupReduceToDPPPatterns(
patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2));
+ populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
+ patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2));
}
populateGpuLowerSubgroupReduceToShufflePatterns(
patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32);
>From dee4b9a334572d1429d7e6ba05b1634af17d8cab Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 01:06:47 -0500
Subject: [PATCH 18/23] moving permlanex16 changes to another commit
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 4 +---
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 16 --------------
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 6 ------
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 14 -------------
.../GPU/Transforms/SubgroupReduceLowering.cpp | 21 ++++++++++---------
mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir | 8 -------
6 files changed, 12 insertions(+), 57 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 17c1162170073..108d7237ff703 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -524,8 +524,7 @@ def AMDGPU_DPPPerm : I32EnumAttr<"DPPPerm",
I32EnumAttrCase<"row_mirror", 8>,
I32EnumAttrCase<"row_half_mirror", 9>,
I32EnumAttrCase<"row_bcast_15", 10>,
- I32EnumAttrCase<"row_bcast_31", 11>,
- I32EnumAttrCase<"row_share", 12>
+ I32EnumAttrCase<"row_bcast_31", 11>
]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::amdgpu";
@@ -558,7 +557,6 @@ def AMDGPU_DPPOp : AMDGPU_Op<"dpp", [SameTypeOperands, AllTypesMatch<["result",
- Reverse within a half-row (`row_half_mirror`)
- Broadcast the 15th lane of each row to the next row (`row_bcast`)
- Broadcast lane 31 to rows 2 and 3 (`row_bcast`)
- - Broadcast a lane [0-15] within row 0 to all lanes of row 0 (`row_share`)
}];
let results = (outs AnyType:$result);
let assemblyFormat = [{
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 186a4f53f93cb..900155c274b4d 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -668,22 +668,6 @@ def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0],
}];
}
-// PermLaneX16 intrinsic operation
-def ROCDL_PermlaneX16Op : ROCDL_IntrOp<"permlanex16", [], [0],
- [AllTypesMatch<["res", "old", "src0"]>, AllTypesMatch<["src1", "src2"]>], 1, 0, 0,
- [4, 5], ["fi", "boundControl"]>,
- Arguments<(ins LLVM_Type:$old, LLVM_Type:$src0, LLVM_Type:$src1, LLVM_Type:$src2,
- I1Attr:$fi, I1Attr:$boundControl)> {
- let results = (outs LLVM_Type:$res);
- let assemblyFormat = [{
- attr-dict $old `,` $src0 `,` $src1 `,` $src2 `,` $fi `,` $boundControl `:` type($src0) `,` type($src1)
- }];
- let description = [{
- Performs a `permlanex16` operation with the given operands, applying the
- permutation specified by $fi to the provided inputs.
- }];
-}
-
def ROCDL_V2I16Type : FixedVectorOfLengthAndType<[2], [I16]>,
BuildableType<"::mlir::VectorType::get("
"{2},$_builder.getI16Type())">;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 4d343c8f3200c..5f697bdeef566 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1293,7 +1293,6 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
ROW_HALF_MIRROR = 0x141,
BCAST15 = 0x142,
BCAST31 = 0x143,
- ROW_SHARE0 = 0x150
};
auto kind = DppOp.getKind();
@@ -1351,11 +1350,6 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
case DPPPerm::row_bcast_31:
DppCtrl = DppCtrl::BCAST31;
break;
- case DPPPerm::row_share:
- if (auto intAttr = cast<IntegerAttr>(*permArgument)) {
- DppCtrl = intAttr.getInt() + DppCtrl::ROW_SHARE0;
- }
- break;
}
// Check for row_mask, bank_mask, bound_ctrl if they exist and create
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index af4438f028542..549a4376a4a04 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -468,20 +468,6 @@ LogicalResult DPPOp::verify() {
}
break;
}
-
- case DPPPerm::row_share: {
- if (!permArgument) {
- return emitOpError("Attribute '" + Twine(stringifyDPPPerm(kind)) +
- "' value not specified");
- }
- if (auto intAttr = dyn_cast<IntegerAttr>(permArgument)) {
- uint32_t attrValue = intAttr.getInt();
- if (attrValue < 0 || attrValue > 15) {
- return emitOpError(
- "Attribute value for 'row_share' must be between 0 and 15");
- }
- }
- } break;
}
return success();
}
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 0a0dc95b0c0d9..77201f319164f 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -366,7 +366,7 @@ struct VectorSubgroupReduceToShuffles final
bool matchClustered = false;
};
-Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
+std::optional<Value> createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
gpu::AllReduceOperation mode,
const ClusterInfo &ci,
amdgpu::Chipset chipset) {
@@ -418,12 +418,8 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
dppResult = b.create<amdgpu::DPPOp>(
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
b.getUnitAttr(), 0xa, allBanks, /*bound_ctrl*/ false);
- } else if (chipset.majorVersion >= 10) {
- Value uIntMaxConst = b.create<arith::ConstantOp>(loc, b.getI32Type(),
- b.getI32IntegerAttr(-1));
- dppResult = b.create<ROCDL::PermlaneX16Op>(
- loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst,
- true, false);
+ } else {
+ return std::nullopt;
}
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
result, dppResult);
@@ -479,9 +475,14 @@ struct ScalarSubgroupReduceToDPP final
op, "value type is not a compatible scalar");
Location loc = op.getLoc();
- rewriter.replaceOp(op,
- createSubgroupDPPReduction(rewriter, loc, op.getValue(),
- op.getOp(), *ci, chipset));
+ std::optional<Value> dpp = createSubgroupDPPReduction(
+ rewriter, loc, op.getValue(), op.getOp(), *ci, chipset);
+ if (!dpp)
+ return rewriter.notifyMatchFailure(
+ op, "Subgroup reduce lowering to DPP not currently supported for "
+ "this device.");
+
+ rewriter.replaceOp(op, *dpp);
return success();
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
index 64b3328b70ab4..14691e73e62d7 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
@@ -137,11 +137,3 @@ func.func @row_bcast_update_dpp_f16(%arg0: f16, %arg1: f16) -> f16 {
%0 = amdgpu.dpp %arg0 %arg1 row_bcast_15 { bound_ctrl = true } : f16
return %0 : f16
}
-
-func.func @dpp_row_share(%arg0: i32, %arg1: i32) -> i32 {
- // CHECK-LABEL: func @dpp_row_share
- // CHECK: rocdl.update.dpp %arg0, %arg1 with 351, 15, 15, false : i32
- // CHECK: return %0 : i32
- %0 = amdgpu.dpp %arg0 %arg1 row_share ( 0xf : i32 ) : i32
- return %0 : i32
-}
>From 329376bdfbe309b254fd47878720fbc20553647c Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 02:27:29 -0500
Subject: [PATCH 19/23] fixing test
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../Dialect/GPU/subgroup-reduce-lowering.mlir | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
index 018ea835ea38c..11db35e31588b 100644
--- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
+++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
@@ -32,11 +32,15 @@ gpu.module @kernels {
// CHECK-SUB: %[[R2:.+]] = gpu.subgroup_reduce add %[[E2]] : (f16) -> f16
// CHECK-SUB: %[[V2:.+]] = vector.insert %[[R2]], %[[V1]] [4] : f16 into vector<5xf16>
// CHECK-SUB: "test.consume"(%[[V2]]) : (vector<5xf16>) -> ()
+ // CHECK-DPP-COUNT-6: amdgpu.dpp
+ // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (vector<5xf16>) -> (vector<5xf16>)
"test.consume"(%sum0) : (vector<5xf16>) -> ()
// CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform
// CHECK-SUB: "test.consume"
+ // CHECK-DPP-COUNT-6: amdgpu.dpp
+ // CHECK-DPP: rocdl.readlane
%sum1 = gpu.subgroup_reduce mul %arg0 uniform : (vector<5xf16>) -> (vector<5xf16>)
"test.consume"(%sum1) : (vector<5xf16>) -> ()
@@ -66,11 +70,15 @@ gpu.module @kernels {
// CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32
// CHECK-SUB: %[[V0:.+]] = vector.broadcast %[[R0]] : f32 to vector<1xf32>
// CHECK-SUB: "test.consume"(%[[V0]]) : (vector<1xf32>) -> ()
+ // CHECK-DPP-COUNT-6: amdgpu.dpp
+ // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (vector<1xf32>) -> (vector<1xf32>)
"test.consume"(%sum0) : (vector<1xf32>) -> ()
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform : (f32) -> f32
// CHECK-SUB: "test.consume"
+ // CHECK-DPP-COUNT-6: amdgpu.dpp
+ // CHECK-DPP: rocdl.readlane
%sum1 = gpu.subgroup_reduce add %arg0 uniform : (vector<1xf32>) -> (vector<1xf32>)
"test.consume"(%sum1) : (vector<1xf32>) -> ()
@@ -84,6 +92,7 @@ gpu.module @kernels {
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32
// CHECK-SUB: "test.consume"
+ // CHECK-DPP-NOT: amdgpu.dpp
%sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>)
"test.consume"(%sum3) : (vector<1xf32>) -> ()
@@ -137,6 +146,9 @@ gpu.module @kernels {
// CHECK-SHFL: %[[S4:.+]], %{{.+}} = gpu.shuffle xor %[[A3]], %[[C16]], %[[C32]] : i32
// CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32
// CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> ()
+
+ // CHECK-DPP-COUNT-6: amdgpu.dpp
+ // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (i32) -> i32
"test.consume"(%sum0) : (i32) -> ()
@@ -258,7 +270,6 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel5(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
// CHECK-DPP-LABEL: gpu.func @kernel5(
- // CHECK-DPP-NOT: amdgpu.dpp
gpu.func @kernel5(%arg0: i16) kernel {
// CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
// CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -270,6 +281,8 @@ gpu.module @kernels {
// CHECK-SHFL: arith.trunci {{.+}} : i32 to i16
// CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
// CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
+ // CHECK-DPP-COUNT-6: amdgpu.dpp
+ // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (i16) -> i16
"test.consume"(%sum0) : (i16) -> ()
>From a8c410d9d49c228e267fa9258a909fb4f3f33f6a Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 10:38:42 -0500
Subject: [PATCH 20/23] fixing code formatting
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../mlir/Dialect/GPU/Transforms/Passes.h | 14 ++++++-----
.../GPU/Transforms/SubgroupReduceLowering.cpp | 23 ++++++++++---------
mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 10 ++++----
3 files changed, 25 insertions(+), 22 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index f113649e0c908..a13ad33df29cd 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -63,6 +63,12 @@ void populateGpuLowerSubgroupReduceToShufflePatterns(
RewritePatternSet &patterns, unsigned subgroupSize,
unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);
+/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns`
+/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
+void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
+ RewritePatternSet &patterns, unsigned subgroupSize,
+ unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);
+
/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp`
/// ops over scalar types. Assumes that the subgroup has
/// `subgroupSize` lanes. Applicable only to AMD GPUs.
@@ -71,16 +77,12 @@ void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns,
amdgpu::Chipset chipset,
PatternBenefit benefit = 1);
+/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToDPPPatterns`
+/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
void populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
PatternBenefit benefit = 1);
-/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns`
-/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
-void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
- RewritePatternSet &patterns, unsigned subgroupSize,
- unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);
-
/// Collect all patterns to rewrite ops within the GPU dialect.
inline void populateGpuRewritePatterns(RewritePatternSet &patterns) {
populateGpuAllReducePatterns(patterns);
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 77201f319164f..55176f5b10959 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -10,13 +10,13 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
-#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Location.h"
@@ -366,10 +366,11 @@ struct VectorSubgroupReduceToShuffles final
bool matchClustered = false;
};
-std::optional<Value> createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
- gpu::AllReduceOperation mode,
- const ClusterInfo &ci,
- amdgpu::Chipset chipset) {
+std::optional<Value> createSubgroupDPPReduction(OpBuilder &b, Location loc,
+ Value input,
+ gpu::AllReduceOperation mode,
+ const ClusterInfo &ci,
+ amdgpu::Chipset chipset) {
Value dppResult;
Value result = input;
constexpr int allRows = 0xf;
@@ -510,11 +511,11 @@ void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
}
void mlir::populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
- RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
- PatternBenefit benefit) {
-patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
- /*matchClustered=*/true, chipset,
- benefit);
+ RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
+ PatternBenefit benefit) {
+ patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
+ /*matchClustered=*/true, chipset,
+ benefit);
}
void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index f34b882c1be86..fe402da4cc105 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -31,9 +31,8 @@ struct TestGpuRewritePass
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestGpuRewritePass)
void getDependentDialects(DialectRegistry &registry) const override {
- registry.insert<amdgpu::AMDGPUDialect, arith::ArithDialect,
- func::FuncDialect, index::IndexDialect,
- memref::MemRefDialect, ROCDL::ROCDLDialect>();
+ registry.insert<arith::ArithDialect, func::FuncDialect, index::IndexDialect,
+ memref::MemRefDialect>();
}
StringRef getArgument() const final { return "test-gpu-rewrite"; }
StringRef getDescription() const final {
@@ -58,8 +57,9 @@ struct TestGpuSubgroupReduceLoweringPass
: PassWrapper(pass) {}
void getDependentDialects(DialectRegistry &registry) const override {
- registry.insert<amdgpu::AMDGPUDialect, arith::ArithDialect, LLVM::LLVMDialect,
- ROCDL::ROCDLDialect, vector::VectorDialect>();
+ registry
+ .insert<amdgpu::AMDGPUDialect, arith::ArithDialect, LLVM::LLVMDialect,
+ ROCDL::ROCDLDialect, vector::VectorDialect>();
}
StringRef getArgument() const final {
>From bcc7e43732ef6dbf278297b6bd9eba6b569ac977 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 15:28:57 -0500
Subject: [PATCH 21/23] Updating implementation to support gfx 10+
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
.../GPU/Transforms/SubgroupReduceLowering.cpp | 148 +++++++++++-------
1 file changed, 90 insertions(+), 58 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 55176f5b10959..c1dedd9216a14 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -22,6 +22,7 @@
#include "mlir/IR/Location.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
@@ -371,72 +372,103 @@ std::optional<Value> createSubgroupDPPReduction(OpBuilder &b, Location loc,
gpu::AllReduceOperation mode,
const ClusterInfo &ci,
amdgpu::Chipset chipset) {
- Value dppResult;
Value result = input;
constexpr int allRows = 0xf;
constexpr int allBanks = 0xf;
const bool boundCtrl = true;
- Value lane31 =
- b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(31));
- Value lane63 =
- b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(63));
- if (ci.clusterSize >= 2) {
- auto permArg = b.getI32ArrayAttr({1, 0, 3, 2});
- dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::quad_perm, permArg,
- allRows, allBanks, boundCtrl);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- if (ci.clusterSize >= 4) {
- auto permArg = b.getI32ArrayAttr({2, 3, 0, 1});
- dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
- amdgpu::DPPPerm::quad_perm, permArg,
- allRows, allBanks, boundCtrl);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- if (ci.clusterSize >= 8) {
- dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
- b.getUnitAttr(), allRows, allBanks, boundCtrl);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- if (ci.clusterSize >= 16) {
- dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror,
- b.getUnitAttr(), allRows, allBanks, boundCtrl);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- }
-
- if (ci.clusterSize >= 32) {
- if (chipset.majorVersion <= 9) {
+ Value lane0 =
+ b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(0));
+ Value lane32 =
+ b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(32));
+
+ auto dppReduceAcrossLanes = [&](int numLanes,
+ Value res) -> std::optional<Value> {
+ Value dppResult, laneVal;
+
+ switch (numLanes) {
+ case 2:
+ // Perform reduction between all lanes N <-> N+1.
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+ b.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
+ break;
+ case 4:
+ // Perform reduction between all lanes N <-> N+2.
dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
- b.getUnitAttr(), 0xa, allBanks, /*bound_ctrl*/ false);
- } else {
+ loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+ b.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
+ break;
+ case 8:
+ // Perform reduction between all lanes N <-> 7-N,
+ // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
+ b.getUnitAttr(), allRows, allBanks, boundCtrl);
+ break;
+ case 16:
+ // Perform reduction between all lanes N <-> 15-N,
+ // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), res, res, amdgpu::DPPPerm::row_mirror,
+ b.getUnitAttr(), allRows, allBanks, boundCtrl);
+ break;
+ case 32:
+ if (chipset.majorVersion <= 9) {
+ // Broadcast last value from each row to next row.
+ // Use row mask to avoid polluting rows 1 and 3.
+ dppResult = b.create<amdgpu::DPPOp>(loc, res.getType(), res, res,
+ amdgpu::DPPPerm::row_bcast_15,
+ b.getUnitAttr(), 0xa, allBanks,
+ /*bound_ctrl*/ false);
+ } else if (chipset.majorVersion <= 12) {
+ // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
+ dppResult = b.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
+ -1, -1, /*fi=*/true,
+ /*bound_ctrl=*/false);
+ if (ci.subgroupSize == 32) {
+ dppResult =
+ b.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+ }
+ } else {
+ return std::nullopt;
+ }
+ break;
+ case 64:
+ if (chipset.majorVersion <= 9) {
+ // Broadcast 31st lane value to rows 2 and 3.
+ // Use row mask to avoid polluting rows 0 and 1.
+ dppResult = b.create<amdgpu::DPPOp>(loc, res.getType(), res, res,
+ amdgpu::DPPPerm::row_bcast_31,
+ b.getUnitAttr(), 0xc, allBanks,
+ /*bound_ctrl*/ false);
+ } else if (chipset.majorVersion <= 12) {
+ // Assume reduction across 32 lanes has been done.
+ // Perform final reduction manually by summing values in lane 0 and
+ // lane 32.
+ dppResult =
+ b.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane32);
+ laneVal = b.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+ return vector::makeArithReduction(
+ b, loc, gpu::convertReductionKind(mode), dppResult, laneVal);
+ } else {
+ return std::nullopt;
+ }
+ break;
+ default:
+ // Should never reach here given previous validation of ClusterInfo.
+ llvm_unreachable("ERROR: Unexpected cluster size.");
return std::nullopt;
}
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- if (ci.subgroupSize == 32) {
- result =
- b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane31);
+ return vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ res, dppResult);
+ };
+
+ for (unsigned cs = 2; cs <= ci.clusterSize; cs = cs << 1) {
+ if (auto dpp = dppReduceAcrossLanes(cs, result)) {
+ result = *dpp;
+ continue;
}
- }
-
- if (ci.clusterSize == 64) {
- dppResult = b.create<amdgpu::DPPOp>(
- loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
- b.getUnitAttr(), 0xc, allBanks, /*bound_ctrl*/ false);
- result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
- result, dppResult);
- result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
+ return std::nullopt;
}
assert(result.getType() == input.getType());
>From 93278aa409fbf23f954784ee66f41c5d566f7c97 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 15:54:08 -0500
Subject: [PATCH 22/23] Small formatting change
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index c1dedd9216a14..28c569ee2bd83 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -463,7 +463,7 @@ std::optional<Value> createSubgroupDPPReduction(OpBuilder &b, Location loc,
res, dppResult);
};
- for (unsigned cs = 2; cs <= ci.clusterSize; cs = cs << 1) {
+ for (unsigned cs = 2; cs <= ci.clusterSize; cs <<= 1) {
if (auto dpp = dppReduceAcrossLanes(cs, result)) {
result = *dpp;
continue;
>From 987c4d7bcc861efdecf8ad3c6da0e3e2aad64cb6 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed at amd.com>
Date: Wed, 16 Apr 2025 16:50:03 -0500
Subject: [PATCH 23/23] Removing ReadlaneOps from test
Signed-off-by: Muzammiluddin Syed <muzasyed at amd.com>
---
mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir | 6 ------
1 file changed, 6 deletions(-)
diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
index 11db35e31588b..139edf6882df6 100644
--- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
+++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
@@ -33,14 +33,12 @@ gpu.module @kernels {
// CHECK-SUB: %[[V2:.+]] = vector.insert %[[R2]], %[[V1]] [4] : f16 into vector<5xf16>
// CHECK-SUB: "test.consume"(%[[V2]]) : (vector<5xf16>) -> ()
// CHECK-DPP-COUNT-6: amdgpu.dpp
- // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (vector<5xf16>) -> (vector<5xf16>)
"test.consume"(%sum0) : (vector<5xf16>) -> ()
// CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform
// CHECK-SUB: "test.consume"
// CHECK-DPP-COUNT-6: amdgpu.dpp
- // CHECK-DPP: rocdl.readlane
%sum1 = gpu.subgroup_reduce mul %arg0 uniform : (vector<5xf16>) -> (vector<5xf16>)
"test.consume"(%sum1) : (vector<5xf16>) -> ()
@@ -71,14 +69,12 @@ gpu.module @kernels {
// CHECK-SUB: %[[V0:.+]] = vector.broadcast %[[R0]] : f32 to vector<1xf32>
// CHECK-SUB: "test.consume"(%[[V0]]) : (vector<1xf32>) -> ()
// CHECK-DPP-COUNT-6: amdgpu.dpp
- // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (vector<1xf32>) -> (vector<1xf32>)
"test.consume"(%sum0) : (vector<1xf32>) -> ()
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform : (f32) -> f32
// CHECK-SUB: "test.consume"
// CHECK-DPP-COUNT-6: amdgpu.dpp
- // CHECK-DPP: rocdl.readlane
%sum1 = gpu.subgroup_reduce add %arg0 uniform : (vector<1xf32>) -> (vector<1xf32>)
"test.consume"(%sum1) : (vector<1xf32>) -> ()
@@ -148,7 +144,6 @@ gpu.module @kernels {
// CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> ()
// CHECK-DPP-COUNT-6: amdgpu.dpp
- // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (i32) -> i32
"test.consume"(%sum0) : (i32) -> ()
@@ -282,7 +277,6 @@ gpu.module @kernels {
// CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
// CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
// CHECK-DPP-COUNT-6: amdgpu.dpp
- // CHECK-DPP: rocdl.readlane
%sum0 = gpu.subgroup_reduce add %arg0 : (i16) -> i16
"test.consume"(%sum0) : (i16) -> ()
More information about the Mlir-commits
mailing list