[Mlir-commits] [mlir] [mlir][gpu] Allow subgroup reductions over 1-d vector types (PR #76015)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Tue Dec 19 22:19:03 PST 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir
@llvm/pr-subscribers-mlir-gpu
Author: Jakub Kuderski (kuhar)
<details>
<summary>Changes</summary>
Each vector element is reduced independently, which is a form of multi-reduction.
The plan is to allow for gradual lowering of multi-redictions that results in fewer `gpu.shuffle` ops at the end:
1d `vector.multi_reduction` --> 1d `gpu.subgroup_reduce` --> smaller 1d `gpu.subgroup_reduce` --> packed `gpu.shuffle` over i32 or i64
For example we can perform 4 independent f16 reductions with a series of `gpu.shuffles` over i64, reducing the final number of `gpu.shuffles` by 4x.
---
Full diff: https://github.com/llvm/llvm-project/pull/76015.diff
4 Files Affected:
- (modified) mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (+12-4)
- (modified) mlir/lib/Dialect/GPU/IR/GPUDialect.cpp (+10-1)
- (modified) mlir/test/Dialect/GPU/invalid.mlir (+11-3)
- (modified) mlir/test/Dialect/GPU/ops.mlir (+5)
``````````diff
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 2e21cd77d2d83b..7777dd58eba1f7 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -19,10 +19,11 @@ include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
include "mlir/Dialect/GPU/IR/CompilationAttrs.td"
include "mlir/Dialect/GPU/IR/ParallelLoopMapperAttr.td"
include "mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td"
+include "mlir/IR/CommonTypeConstraints.td"
include "mlir/IR/EnumAttr.td"
-include "mlir/Interfaces/FunctionInterfaces.td"
include "mlir/IR/SymbolInterfaces.td"
include "mlir/Interfaces/DataLayoutInterfaces.td"
+include "mlir/Interfaces/FunctionInterfaces.td"
include "mlir/Interfaces/InferIntRangeInterface.td"
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
@@ -1022,16 +1023,23 @@ def GPU_AllReduceOp : GPU_Op<"all_reduce",
let hasRegionVerifier = 1;
}
+def AnyIntegerOrFloatOr1DVector :
+ AnyTypeOf<[AnyIntegerOrFloat, VectorOfRankAndType<[1], [AnyIntegerOrFloat]>]>;
+
def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType]> {
let summary = "Reduce values among subgroup.";
let description = [{
The `subgroup_reduce` op reduces the value of every work item across a
subgroup. The result is equal for all work items of a subgroup.
+ When the reduced value is of a vector type, each vector element is reduced
+ independently. Only 1-d vector types are allowed.
+
Example:
```mlir
- %1 = gpu.subgroup_reduce add %0 : (f32) -> (f32)
+ %1 = gpu.subgroup_reduce add %a : (f32) -> (f32)
+ %2 = gpu.subgroup_reduce add %b : (vector<4xf16>) -> (vector<4xf16>)
```
If `uniform` flag is set either none or all work items of a subgroup
@@ -1044,11 +1052,11 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType]
}];
let arguments = (ins
- AnyIntegerOrFloat:$value,
+ AnyIntegerOrFloatOr1DVector:$value,
GPU_AllReduceOperationAttr:$op,
UnitAttr:$uniform
);
- let results = (outs AnyIntegerOrFloat:$result);
+ let results = (outs AnyIntegerOrFloatOr1DVector:$result);
let assemblyFormat = [{ custom<AllReduceOperation>($op) $value
(`uniform` $uniform^)? attr-dict
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 7c3330f4c238f8..dd482f305fcbc8 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -19,6 +19,7 @@
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
@@ -588,8 +589,16 @@ static void printAllReduceOperation(AsmPrinter &printer, Operation *op,
//===----------------------------------------------------------------------===//
LogicalResult gpu::SubgroupReduceOp::verify() {
+ Type elemType = getType();
+ if (auto vecTy = dyn_cast<VectorType>(elemType)) {
+ if (vecTy.isScalable())
+ return emitOpError() << "is not compatible with scalable vector types";
+
+ elemType = vecTy.getElementType();
+ }
+
gpu::AllReduceOperation opName = getOp();
- if (failed(verifyReduceOpAndType(opName, getType()))) {
+ if (failed(verifyReduceOpAndType(opName, elemType))) {
return emitError() << '`' << gpu::stringifyAllReduceOperation(opName)
<< "` reduction operation is not compatible with type "
<< getType();
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index d8a40f89f80ac2..8a34d64326072b 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -333,9 +333,17 @@ func.func @reduce_invalid_op_type_maximumf(%arg0 : i32) {
// -----
-func.func @subgroup_reduce_bad_type(%arg0 : vector<2xf32>) {
- // expected-error at +1 {{'gpu.subgroup_reduce' op operand #0 must be Integer or Float}}
- %res = gpu.subgroup_reduce add %arg0 : (vector<2xf32>) -> vector<2xf32>
+func.func @subgroup_reduce_bad_type(%arg0 : vector<2x2xf32>) {
+ // expected-error at +1 {{'gpu.subgroup_reduce' op operand #0 must be Integer or Float or vector of}}
+ %res = gpu.subgroup_reduce add %arg0 : (vector<2x2xf32>) -> vector<2x2xf32>
+ return
+}
+
+// -----
+
+func.func @subgroup_reduce_bad_type_scalable(%arg0 : vector<[2]xf32>) {
+ // expected-error at +1 {{is not compatible with scalable vector types}}
+ %res = gpu.subgroup_reduce add %arg0 : (vector<[2]xf32>) -> vector<[2]xf32>
return
}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 48193436415637..c3b548062638f1 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -84,6 +84,8 @@ module attributes {gpu.container_module} {
%one = arith.constant 1.0 : f32
+ %vec = vector.broadcast %arg0 : f32 to vector<4xf32>
+
// CHECK: %{{.*}} = gpu.all_reduce add %{{.*}} {
// CHECK-NEXT: } : (f32) -> f32
%sum = gpu.all_reduce add %one {} : (f32) -> (f32)
@@ -98,6 +100,9 @@ module attributes {gpu.container_module} {
// CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} uniform : (f32) -> f32
%sum_subgroup1 = gpu.subgroup_reduce add %one uniform : (f32) -> f32
+ // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} : (vector<4xf32>) -> vector<4xf32
+ %sum_subgroup2 = gpu.subgroup_reduce add %vec : (vector<4xf32>) -> vector<4xf32>
+
%width = arith.constant 7 : i32
%offset = arith.constant 3 : i32
// CHECK: gpu.shuffle xor %{{.*}}, %{{.*}}, %{{.*}} : f32
``````````
</details>
https://github.com/llvm/llvm-project/pull/76015
More information about the Mlir-commits
mailing list