[Mlir-commits] [mlir] a3388f3 - [mlir] Introduce a pattern to lower `gpu.subgroup_reduce` to `nvvm.redux_op`
Guray Ozen
llvmlistbot at llvm.org
Fri Jan 20 04:56:29 PST 2023
Author: Guray Ozen
Date: 2023-01-20T13:56:23+01:00
New Revision: a3388f3e2a30eaca4123f14a6444fd5aa03b24c5
URL: https://github.com/llvm/llvm-project/commit/a3388f3e2a30eaca4123f14a6444fd5aa03b24c5
DIFF: https://github.com/llvm/llvm-project/commit/a3388f3e2a30eaca4123f14a6444fd5aa03b24c5.diff
LOG: [mlir] Introduce a pattern to lower `gpu.subgroup_reduce` to `nvvm.redux_op`
This revision introduces a pattern to lower the `gpu.subgroup_reduce` op to the `nvvm.redux.sync` op. The op must be executed by the entire subgroup; otherwise the behaviour is undefined.
It also adds a pass option and a populate function, because the op is not available on every GPU (it requires sm_80+), so the lowering can be enabled only where it is supported.
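For illustration, here is a minimal sketch of the intended rewrite, assuming the NVVM dialect's printed form of `nvvm.redux.sync` and hypothetical SSA names; the membermask produced by the pattern is the constant -1, i.e. all lanes participate:

  // Before: must be executed uniformly by every lane of the subgroup.
  %sum = gpu.subgroup_reduce add %value uniform {} : (i32) -> (i32)

  // After -convert-gpu-to-nvvm='has-redux=1' (sketch):
  %mask = llvm.mlir.constant(-1 : i32) : i32
  %sum = nvvm.redux.sync add %value, %mask : i32 -> i32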
Depends on D142088
Reviewed By: nicolasvasilache
Differential Revision: https://reviews.llvm.org/D142103
Added:
Modified:
mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
mlir/include/mlir/Conversion/Passes.td
mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
index e05e13865e1bc..46f29c6dd8b92 100644
--- a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
+++ b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
@@ -37,6 +37,11 @@ void configureGpuToNVVMConversionLegality(ConversionTarget &target);
void populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns);
+/// Populate the pattern that lowers gpu.subgroup_reduce to NVVM. It generates
+/// a specific nvvm op that is not available on every GPU.
+void populateGpuSubgroupReduceOpLoweringPattern(LLVMTypeConverter &converter,
+ RewritePatternSet &patterns);
+
/// Collect a set of patterns to convert WMMA ops from GPU dialect to NVVM.
void populateGpuWMMAToNVVMConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns);
@@ -45,7 +50,8 @@ void populateGpuWMMAToNVVMConversionPatterns(LLVMTypeConverter &converter,
/// index bitwidth used for the lowering of the device side index computations
/// is configurable.
std::unique_ptr<OperationPass<gpu::GPUModuleOp>> createLowerGpuOpsToNVVMOpsPass(
- unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout);
+ unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
+ bool hasRedux = false);
} // namespace mlir
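As a usage sketch in C++ (hypothetical downstream pipeline code; only createLowerGpuOpsToNVVMOpsPass and populateGpuSubgroupReduceOpLoweringPattern come from this revision, and pm/converter/patterns are assumed to exist in the surrounding code), a target that supports redux (sm_80+) could opt in as follows:

  #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"

  // Option 1: run the full GPU-to-NVVM pass with the redux lowering enabled.
  pm.addNestedPass<gpu::GPUModuleOp>(mlir::createLowerGpuOpsToNVVMOpsPass(
      /*indexBitwidth=*/0, /*hasRedux=*/true));

  // Option 2: add only the subgroup-reduce pattern to an existing pattern set.
  mlir::populateGpuSubgroupReduceOpLoweringPattern(converter, patterns);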
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 40ade95a09f95..16a64caf1d7f5 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -359,7 +359,9 @@ def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> {
let options = [
Option<"indexBitwidth", "index-bitwidth", "unsigned",
/*default=kDeriveIndexBitwidthFromDataLayout*/"0",
- "Bitwidth of the index type, 0 to use size of machine word">
+ "Bitwidth of the index type, 0 to use size of machine word">,
+ Option<"hasRedux", "has-redux", "bool", /*default=*/"false",
+ "Target gpu supports redux">,
];
}
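From the command line, the lowering is enabled through the new option, mirroring the updated RUN lines in the test below:

  mlir-opt input.mlir -convert-gpu-to-nvvm='has-redux=1'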
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 40fd9520a563a..cefe493ecbc05 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -58,6 +58,60 @@ static NVVM::ShflKind convertShflKind(gpu::ShuffleMode mode) {
llvm_unreachable("unknown shuffle mode");
}
+static Optional<NVVM::ReduxKind>
+convertReduxKind(gpu::AllReduceOperation mode) {
+ switch (mode) {
+ case gpu::AllReduceOperation::ADD:
+ return NVVM::ReduxKind::ADD;
+ case gpu::AllReduceOperation::AND:
+ return NVVM::ReduxKind::AND;
+ case gpu::AllReduceOperation::MAX:
+ return NVVM::ReduxKind::MAX;
+ case gpu::AllReduceOperation::MIN:
+ return NVVM::ReduxKind::MIN;
+ case gpu::AllReduceOperation::OR:
+ return NVVM::ReduxKind::OR;
+ case gpu::AllReduceOperation::XOR:
+ return NVVM::ReduxKind::XOR;
+ case gpu::AllReduceOperation::MUL:
+ return std::nullopt;
+ }
+ return std::nullopt;
+}
+
+/// This pattern lowers the gpu.subgroup_reduce op to the nvvm.redux op. The op
+/// must be run by the entire subgroup, otherwise the behaviour is undefined.
+struct GPUSubgroupReduceOpLowering
+ : public ConvertOpToLLVMPattern<gpu::SubgroupReduceOp> {
+ using ConvertOpToLLVMPattern<gpu::SubgroupReduceOp>::ConvertOpToLLVMPattern;
+  LogicalResult
+  matchAndRewrite(gpu::SubgroupReduceOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ if (!op.getUniform())
+ return rewriter.notifyMatchFailure(
+ op, "cannot be lowered to redux as the op must be run "
+ "uniformly (entire subgroup).");
+ if (!op.getValue().getType().isInteger(32))
+ return rewriter.notifyMatchFailure(op, "unsupported data type");
+
+ Optional<NVVM::ReduxKind> mode = convertReduxKind(op.getOp());
+ if (!mode.has_value())
+ return rewriter.notifyMatchFailure(
+ op, "unsupported reduction mode for redux");
+
+ Location loc = op->getLoc();
+ auto int32Type = IntegerType::get(rewriter.getContext(), 32);
+ Value offset = rewriter.create<LLVM::ConstantOp>(loc, int32Type, -1);
+
+ auto reduxOp = rewriter.create<NVVM::ReduxOp>(loc, int32Type, op.getValue(),
+ mode.value(), offset);
+
+ rewriter.replaceOp(op, reduxOp->getResult(0));
+ return success();
+ }
+};
+
struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
@@ -155,8 +209,9 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
struct LowerGpuOpsToNVVMOpsPass
: public impl::ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
LowerGpuOpsToNVVMOpsPass() = default;
- LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth) {
+ LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux = false) {
this->indexBitwidth = indexBitwidth;
+ this->hasRedux = hasRedux;
}
void runOnOperation() override {
@@ -229,6 +284,8 @@ struct LowerGpuOpsToNVVMOpsPass
populateMemRefToLLVMConversionPatterns(converter, llvmPatterns);
populateGpuToNVVMConversionPatterns(converter, llvmPatterns);
populateGpuWMMAToNVVMConversionPatterns(converter, llvmPatterns);
+ if (this->hasRedux)
+ populateGpuSubgroupReduceOpLoweringPattern(converter, llvmPatterns);
LLVMConversionTarget target(getContext());
configureGpuToNVVMConversionLegality(target);
if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
@@ -259,6 +316,11 @@ static void populateOpPatterns(LLVMTypeConverter &converter,
patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func);
}
+void mlir::populateGpuSubgroupReduceOpLoweringPattern(
+ LLVMTypeConverter &converter, RewritePatternSet &patterns) {
+ patterns.add<GPUSubgroupReduceOpLowering>(converter);
+}
+
void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns) {
populateWithGenerated(patterns);
@@ -323,6 +385,6 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
}
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth) {
- return std::make_unique<LowerGpuOpsToNVVMOpsPass>(indexBitwidth);
+mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux) {
+ return std::make_unique<LowerGpuOpsToNVVMOpsPass>(indexBitwidth, hasRedux);
}
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 4a7e6f5c6631b..653902824afd6 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -1,5 +1,5 @@
-// RUN: mlir-opt %s -convert-gpu-to-nvvm -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -convert-gpu-to-nvvm='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
+// RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1' -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
gpu.module @test_module {
// CHECK-LABEL: func @gpu_index_ops()
@@ -574,3 +574,44 @@ gpu.module @test_module {
}
}
+// -----
+
+gpu.module @test_module {
+ // CHECK-LABEL: func @subgroup_reduce_add
+ gpu.func @subgroup_reduce_add(%arg0 : i32) {
+ // CHECK: nvvm.redux.sync add {{.*}}
+ %result = gpu.subgroup_reduce add %arg0 uniform {} : (i32) -> (i32)
+ gpu.return
+ }
+ // CHECK-LABEL: func @subgroup_reduce_and
+ gpu.func @subgroup_reduce_and(%arg0 : i32) {
+ // CHECK: nvvm.redux.sync and {{.*}}
+ %result = gpu.subgroup_reduce and %arg0 uniform {} : (i32) -> (i32)
+ gpu.return
+ }
+ // CHECK-LABEL: @subgroup_reduce_max
+ gpu.func @subgroup_reduce_max(%arg0 : i32) {
+ // CHECK: nvvm.redux.sync max {{.*}}
+ %result = gpu.subgroup_reduce max %arg0 uniform {} : (i32) -> (i32)
+ gpu.return
+ }
+ // CHECK-LABEL: @subgroup_reduce_min
+ gpu.func @subgroup_reduce_min(%arg0 : i32) {
+ // CHECK: nvvm.redux.sync min {{.*}}
+ %result = gpu.subgroup_reduce min %arg0 uniform {} : (i32) -> (i32)
+ gpu.return
+ }
+ // CHECK-LABEL: @subgroup_reduce_or
+ gpu.func @subgroup_reduce_or(%arg0 : i32) {
+ // CHECK: nvvm.redux.sync or {{.*}}
+ %result = gpu.subgroup_reduce or %arg0 uniform {} : (i32) -> (i32)
+ gpu.return
+ }
+ // CHECK-LABEL: @subgroup_reduce_xor
+ gpu.func @subgroup_reduce_xor(%arg0 : i32) {
+ // CHECK: nvvm.redux.sync xor {{.*}}
+ %result = gpu.subgroup_reduce xor %arg0 uniform {} : (i32) -> (i32)
+ gpu.return
+ }
+}
+