[Mlir-commits] [mlir] Implement gpu.subgroup_reduce with DPP intrinsics on AMD GPUs (PR #133204)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Mar 26 21:17:55 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Muzammil (Muzammiluddin-Syed-ECE)
<details>
<summary>Changes</summary>
[DRAFT]
See related [Issue](https://github.com/iree-org/iree/issues/20007)
We can better leverage DPP ops in the AMDGPU dialect when lowering subgroup reduce ops.
To this end this PR implements a new pass where we perform such a lowering.
To do:
- Improve lowering to subgroup_reduce in compatible matvecs (these get directly lowered to gpu.shuffles in an earlier pass)
- Add test for pass
---
Patch is 52.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133204.diff
10 Files Affected:
- (added) mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h (+35)
- (modified) mlir/include/mlir/Conversion/Passes.h (+1)
- (modified) mlir/include/mlir/Conversion/Passes.td (+16)
- (modified) mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp (+1)
- (modified) mlir/lib/Conversion/CMakeLists.txt (+1)
- (added) mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt (+22)
- (added) mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp (+203)
- (modified) mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt (+1)
- (modified) mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp (+161-1)
- (added) mlir/test/Conversion/GPUToAMDGPU/gpu-to-amdgpu.mlir (+463)
``````````diff
diff --git a/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
new file mode 100644
index 0000000000000..fea9b7ed50bcc
--- /dev/null
+++ b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h
@@ -0,0 +1,35 @@
+//===- GPUToAMDGPU.h - Convert AMDGPU to ROCDL dialect --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
+#define MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
+
+
+#include "mlir/IR/PatternMatch.h"
+#include <memory>
+#include <string>
+
+namespace mlir {
+
+class LLVMTypeConverter;
+class RewritePatternSet;
+class TypeConverter;
+class Pass;
+
+#define GEN_PASS_DECL_CONVERTGPUTOAMDGPUPASS
+#include "mlir/Conversion/Passes.h.inc"
+
+void populateSubgroupReduceLoweringPatterns(LLVMTypeConverter &converter,
+ RewritePatternSet &patterns,
+ unsigned subgroupSize,
+ PatternBenefit benefit);
+// void populateGPUToAMDGPUConversionPatterns(LLVMTypeConverter &converter,
+// RewritePatternSet &patterns);
+
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
\ No newline at end of file
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index ccd862f67c068..1189423799092 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -34,6 +34,7 @@
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
#include "mlir/Conversion/FuncToSPIRV/FuncToSPIRVPass.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
#include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index bbba495e613b2..6a1deeb230794 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -643,6 +643,22 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> {
];
}
+//===----------------------------------------------------------------------===//
+// GPUToAMDGPU
+//===----------------------------------------------------------------------===//
+
+def ConvertGPUToAMDGPUPass : Pass<"convert-gpu-to-amdgpu"> {
+ let summary = "Generate AMDGPU operations for gpu operations";
+ let dependentDialects = [
+ "LLVM::LLVMDialect",
+ "::mlir::gpu::GPUDialect",
+ "amdgpu::AMDGPUDialect",
+ ];
+ let options = [Option<"subgroupSize", "subgroup-size", "unsigned",
+ /*default=*/"64",
+ "Size of subgroup">];
+}
+
//===----------------------------------------------------------------------===//
// ConvertIndexToLLVMPass
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 949424db7c4d6..5296f75571188 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1214,6 +1214,7 @@ struct ConvertAMDGPUToROCDLPass
using Base::Base;
void runOnOperation() override {
+ llvm::errs() << " WHEN DOES AMDGPU TO ROCDL RUN\n";
MLIRContext *ctx = &getContext();
FailureOr<Chipset> maybeChipset = Chipset::parse(chipset);
if (failed(maybeChipset)) {
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index b6c21440c571c..b957a4473f1e6 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -24,6 +24,7 @@ add_subdirectory(FuncToEmitC)
add_subdirectory(FuncToLLVM)
add_subdirectory(FuncToSPIRV)
add_subdirectory(GPUCommon)
+add_subdirectory(GPUToAMDGPU)
add_subdirectory(GPUToLLVMSPV)
add_subdirectory(GPUToNVVM)
add_subdirectory(GPUToROCDL)
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
new file mode 100644
index 0000000000000..9b82b5dc63d9c
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
@@ -0,0 +1,22 @@
+add_mlir_conversion_library(MLIRGPUToAMDGPU
+ GPUToAMDGPU.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/GPUToAMDGPU
+
+ DEPENDS
+ MLIRConversionPassIncGen
+
+ LINK_COMPONENTS
+ Core
+
+ LINK_LIBS PUBLIC
+ MLIRLLVMCommonConversion
+ MLIRLLVMDialect
+ MLIRGPUDialect
+ MLIRAMDGPUDialect
+ MLIRAMDGPUUtils
+ MLIRROCDLDialect
+ MLIRPass
+ MLIRTransforms
+ )
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
new file mode 100644
index 0000000000000..c2fc8b2e19ae6
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
@@ -0,0 +1,203 @@
+//===- GPUToAMDGPU.cpp - GPU to AMDGPU dialect conversion -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
+
+#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
+#include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Pass/Pass.h"
+
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+
+#include "../LLVMCommon/MemRefDescriptor.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include <optional>
+
+namespace mlir {
+#define GEN_PASS_DEF_CONVERTGPUTOAMDGPUPASS
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+
+namespace {
+struct ClusterInfo {
+ unsigned clusterStride;
+ unsigned clusterSize;
+ unsigned subgroupSize;
+};
+
+static FailureOr<ClusterInfo>
+getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) {
+ assert(llvm::isPowerOf2_32(subgroupSize));
+
+ std::optional<uint32_t> clusterSize = op.getClusterSize();
+ assert(!clusterSize ||
+ llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
+ if (clusterSize && *clusterSize > subgroupSize)
+ return op.emitOpError()
+ << "cluster size " << *clusterSize
+ << " is greater than subgroup size " << subgroupSize;
+ unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
+
+ auto clusterStride = op.getClusterStride();
+ assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
+ if (clusterStride >= subgroupSize)
+ return op.emitOpError()
+ << "cluster stride " << clusterStride
+ << " is not less than subgroup size " << subgroupSize;
+
+ return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
+}
+
+Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
+ gpu::AllReduceOperation mode,
+ const ClusterInfo &ci) {
+ Value result = input;
+ if (ci.clusterSize >= 2) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shr, permArg);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 4) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shr, permArg);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 8) {
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+ b.getUnitAttr());
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 16) {
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 32) {
+ // auto permArg = builder.getInt32(15);
+ // auto rowMask = builder.getInt32("0xa");
+ // auto bankMask = builder.getInt32("0xf");
+ // auto boundCtrl = builder.getBoolAttr(false);
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+ b.getUnitAttr(), 10, 15, false);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize == 64) {
+ // auto permArg = builder.getInt32(31);
+ // auto rowMask = builder.getInt32("0xc");
+ // auto bankMask = builder.getInt32("0xf");
+ // auto boundCtrl = builder.getBoolAttr(false);
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
+ b.getUnitAttr(), 12, 15, false);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ // // read lane 63 with the final result.
+ // auto lane = b.getIntegerAttr(b.getIntegerType(32), 63);
+ // result = b.create<ROCDL::ReadLaneOp>(loc, input.getType(), result, lane);
+ assert(result.getType() == input.getType());
+ return result;
+}
+
+struct ScalarSubgroupReduceToShuffles final
+ : OpRewritePattern<gpu::SubgroupReduceOp> {
+ ScalarSubgroupReduceToShuffles(MLIRContext *ctx, unsigned subgroupSize,
+ bool matchClustered, PatternBenefit benefit)
+ : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
+ matchClustered(matchClustered) {}
+
+ LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
+ PatternRewriter &rewriter) const override {
+ llvm::errs() << "ScalarSubgroupReduceToShuffles" << "\n";
+ if (op.getClusterSize().has_value() != matchClustered) {
+ return rewriter.notifyMatchFailure(
+ op, llvm::formatv("op is {0}clustered but pattern is configured to "
+ "only match {1}clustered ops",
+ matchClustered ? "non-" : "",
+ matchClustered ? "" : "non-"));
+ }
+
+ auto ci = getAndValidateClusterInfo(op, subgroupSize);
+ if (failed(ci))
+ return failure();
+
+ Location loc = op.getLoc();
+ rewriter.replaceOp(op, createSubgroupDPPReduction(
+ rewriter, loc, op.getValue(), op.getOp(), *ci));
+ return success();
+ }
+
+private:
+ unsigned subgroupSize = 0;
+ bool matchClustered = false;
+};
+
+struct ConvertGPUToAMDGPUPass
+ : public impl::ConvertGPUToAMDGPUPassBase<ConvertGPUToAMDGPUPass> {
+ using Base::Base;
+
+ void runOnOperation() override {
+ RewritePatternSet patterns(&getContext());
+ LLVMTypeConverter converter(&getContext());
+ LLVMConversionTarget target(getContext());
+ target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
+ target.addLegalDialect<::mlir::amdgpu::AMDGPUDialect>();
+ target.addLegalDialect<::mlir::ROCDL::ROCDLDialect>();
+
+ int subgroupSizeInt = static_cast<int>(subgroupSize);
+ populateSubgroupReduceLoweringPatterns(converter, patterns, subgroupSizeInt,
+ PatternBenefit(1));
+ if (failed(applyPartialConversion(getOperation(), target,
+ std::move(patterns))))
+ signalPassFailure();
+ }
+};
+} // namespace
+
+void mlir::populateSubgroupReduceLoweringPatterns(
+ LLVMTypeConverter &converter, RewritePatternSet &patterns, unsigned subgroupSize, PatternBenefit benefit) {
+ patterns.add<ScalarSubgroupReduceToShuffles>(
+ patterns.getContext(), subgroupSize, /*matchClustered=*/true, benefit);
+}
\ No newline at end of file
diff --git a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
index 945e3ccdfa87b..52484ac69a3e2 100644
--- a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
@@ -15,6 +15,7 @@ add_mlir_conversion_library(MLIRGPUToROCDLTransforms
MLIRMathToLLVM
MLIRMathToROCDL
MLIRAMDGPUToROCDL
+ MLIRGPUToAMDGPU
MLIRFuncToLLVM
MLIRGPUDialect
MLIRGPUToGPURuntimeTransforms
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 43eff3eddcc49..0b553274eceb4 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -11,10 +11,12 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/PatternMatch.h"
@@ -24,6 +26,8 @@
#include <cassert>
#include <cstdint>
+#define DPP
+
using namespace mlir;
namespace {
@@ -188,6 +192,8 @@ Value createSubgroupShuffleReduction(OpBuilder &builder, Location loc,
function_ref<Value(Value)> unpackFn) {
// Lane value always stays in the original type. We use it to perform arith
// reductions.
+ llvm::errs() << "Cluster Stride: " << ci.clusterStride << "\n";
+ llvm::errs() << "Cluster Size: " << ci.clusterSize << "\n";
Value laneVal = input;
// Parallel reduction using butterfly shuffles.
for (unsigned i = ci.clusterStride; i < ci.clusterStride * ci.clusterSize;
@@ -206,6 +212,146 @@ Value createSubgroupShuffleReduction(OpBuilder &builder, Location loc,
return laneVal;
}
+#ifdef DPP
+Value createSubgroupDPPReduction(OpBuilder &b, Location loc,
+ Value input, gpu::AllReduceOperation mode,
+ const ClusterInfo &ci,
+ function_ref<Value(Value)> packFn,
+ function_ref<Value(Value)> unpackFn) {
+ llvm::errs() << "createSubgroupDPPReduction" << "\n";
+ Value result = input;
+ if (ci.clusterSize >= 2) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
+ Value dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result, amdgpu::DPPPerm::row_shr, permArg);
+ llvm::errs() << dppResult << " c 2 \n";
+ result = vector::makeArithReduction(b, loc,
+ gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 4) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
+ Value dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result, amdgpu::DPPPerm::row_shr, permArg);
+ llvm::errs() << dppResult << " c 4 \n";
+ result = vector::makeArithReduction(b, loc,
+ gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 8) {
+
+ Value dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror, b.getUnitAttr());
+ llvm::errs() << dppResult << " c 8 \n";
+ result = vector::makeArithReduction(b, loc,
+ gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 16) {
+ Value dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
+ llvm::errs() << dppResult << " c 16 \n";
+ result = vector::makeArithReduction(b, loc,
+ gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 32) {
+ // auto permArg = builder.getInt32(15);
+ // auto rowMask = builder.getInt32("0xa");
+ // auto bankMask = builder.getInt32("0xf");
+ // auto boundCtrl = builder.getBoolAttr(false);
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
+ Value dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15, b.getUnitAttr(), 10, 15, false);
+ llvm::errs() << dppResult << " c 32 \n";
+ result = vector::makeArithReduction(b, loc,
+ gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize == 64) {
+ // auto permArg = builder.getInt32(31);
+ // auto rowMask = builder.getInt32("0xc");
+ // auto bankMask = builder.getInt32("0xf");
+ // auto boundCtrl = builder.getBoolAttr(false);
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
+ Value dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31, b.getUnitAttr(), 12, 15, false);
+ llvm::errs() << dppResult << " c 64 \n";
+ result = vector::makeArithReduction(b, loc,
+ gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ // // read lane 63 with the final result.
+ // auto lane = b.getIntegerAttr(b.getIntegerType(32), 63);
+ // result = b.create<ROCDL::ReadLaneOp>(loc, input.getType(), result, lane);
+ assert(result.getType() == input.getType());
+ return result;
+}
+#endif
+
+// Value createSubgroupDPPReduction(OpBuilder &b, Location loc,
+// Value input, gpu::AllReduceOperation mode,
+// const ClusterInfo &ci,
+// function_ref<Value(Value)> packFn,
+// function_ref<Value(Value)> unpackFn) {
+
+// Value result = input;
+// if (ci.clusterSize >= 2) {
+// auto permArg = b.getInt32(1);
+// Value dppResult = builder.create<amdgpu::DPPOp>(packFn(result), packFn(result), amdgpu::DPPPerm::row_shr, permArg);
+// result = vector::makeArithReduction(builder, loc,
+// gpu::convertReductionKind(mode),
+// result, unpackFn(dppResult));
+// }
+
+// if (ci.clusterSize >= 4) {
+// auto permArg = builder.getInt32(2);
+// Value dppResult = builder.create<amdgpu::DPPOp>(packFn(result), packFn(result), amdgpu::DPPPerm::row_shr, permArg);
+// result = vector::makeArithReduction(builder, loc,
+// gpu::convertReductionKind(mode),
+// result, unpackFn(dppResult));
+// }
+
+// if (ci.clusterSize >= 8) {
+// Value dppResult = builder.create<amdgpu::DPPOp>(packFn(result), packFn(result), amdgpu::DPPPerm::row_half_mirror);
+// result = vector::makeArithReduction(builder, loc,
+// gpu::convertReductionKind(mode),
+// result, unpackFn(dppResult));
+// }
+
+// if (ci.clusterSize >= 16) {
+// Value dppResult = builder.create<amdgpu::DPPOp>(packFn(result), packFn(result), amdgpu::DPPPerm::row_mirror);
+// result = vector::makeArithReduction(builder, loc,
+// gpu::convertReductionKind(mode),
+// result, unpackFn(dppResult));
+// }
+
+// if (ci.clusterSize >= 32) {
+// auto permArg = builder.getInt32(15);...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/133204
More information about the Mlir-commits
mailing list