[Mlir-commits] [mlir] Implement gpu.subgroup_reduce with DPP intrinsics on AMD GPUs (PR #133204)
Krzysztof Drewniak
llvmlistbot at llvm.org
Fri Mar 28 14:01:23 PDT 2025
================
@@ -0,0 +1,185 @@
+//===- GPUToAMDGPU.cpp - GPU to AMDGPU dialect conversion -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
+
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Pass/Pass.h"
+
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+
+#include "mlir/Transforms/WalkPatternRewriteDriver.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace mlir {
+#define GEN_PASS_DEF_CONVERTGPUTOAMDGPUPASS
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+
+namespace {
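+/// Cluster parameters of a `gpu.subgroup_reduce` op: each cluster of
+/// `clusterSize` lanes, spaced `clusterStride` apart, is reduced independently
+/// within a subgroup of `subgroupSize` lanes.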
+struct ClusterInfo {
+ unsigned clusterStride;
+ unsigned clusterSize;
+ unsigned subgroupSize;
+};
+
+static FailureOr<ClusterInfo>
+getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) {
+ assert(llvm::isPowerOf2_32(subgroupSize));
+
+ std::optional<uint32_t> clusterSize = op.getClusterSize();
+ assert(!clusterSize ||
+ llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
+ if (clusterSize && *clusterSize > subgroupSize)
+ return op.emitOpError()
+ << "cluster size " << *clusterSize
+ << " is greater than subgroup size " << subgroupSize;
+ unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
+
+ auto clusterStride = op.getClusterStride();
+ assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
+ if (clusterStride >= subgroupSize)
+ return op.emitOpError()
+ << "cluster stride " << clusterStride
+ << " is not less than subgroup size " << subgroupSize;
+
+ return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
+}
+
+Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
+ gpu::AllReduceOperation mode,
+ const ClusterInfo &ci) {
+ Value result = input;
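+  // Reduce with a ladder of DPP permutes: each step folds in a partial result
+  // from another lane, so progressively larger groups of lanes end up sharing
+  // the same combined value.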
+ if (ci.clusterSize >= 2) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shr, permArg);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 4) {
+ auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_shr, permArg);
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 8) {
+ Value dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+ b.getUnitAttr());
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+ if (ci.clusterSize >= 16) {
+ Value dppResult =
+ b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+ amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
+ result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+ result, dppResult);
+ }
+
+  if (ci.clusterSize >= 32) {
+    // Broadcast lane 15 of each row to the next row (row_bcast_15), writing
+    // only rows 1 and 3 (row mask 0xa) so each odd row folds in the partial
+    // result accumulated by the row below it.
+    Value dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+        b.getUnitAttr(), 10, 15, false);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize == 64) {
+    // Broadcast lane 31 to rows 2 and 3 (row_bcast_31, row mask 0xc) so the
+    // upper half of the wave folds in the lower half's partial result.
+    Value dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
+        b.getUnitAttr(), 12, 15, false);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+ // // read lane 63 with the final result.
+ // auto lane = b.getIntegerAttr(b.getIntegerType(32), 63);
+ // result = b.create<ROCDL::ReadLaneOp>(loc, input.getType(), result, lane);
----------------
krzysz00 wrote:
You left some stuff commented out
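For what it's worth, the intent of that commented-out tail (reading the fully reduced value from the last lane and handing it to every lane) could be completed roughly as sketched below. This is only an illustration, reusing `b`, `loc`, `result`, and `ci` from the surrounding function; it assumes a readlane-style ROCDL op exists under the spelling used in the comment, and the exact builder signature (SSA lane index vs. attribute) should be double-checked:

```cpp
  // Sketch only: after the DPP ladder, the last lane holds the combined value
  // for the full-subgroup case; read it back so every lane returns the same
  // result.
  if (ci.clusterSize == ci.subgroupSize) {
    Value lastLane = b.create<LLVM::ConstantOp>(
        loc, b.getI32Type(), b.getI32IntegerAttr(ci.subgroupSize - 1));
    result = b.create<ROCDL::ReadLaneOp>(loc, result.getType(), result,
                                         lastLane);
  }
  return result;
```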
https://github.com/llvm/llvm-project/pull/133204