[Mlir-commits] [mlir] [mlir][gpu] Add 'cluster_stride' attribute to gpu.subgroup_reduce (PR #107142)
Andrea Faulds
llvmlistbot at llvm.org
Wed Sep 4 03:33:18 PDT 2024
================
@@ -140,44 +137,75 @@ struct ScalarizeSingleElementReduce final
Location loc = op.getLoc();
Value extracted = rewriter.create<vector::ExtractOp>(loc, op.getValue(), 0);
Value reduce = rewriter.create<gpu::SubgroupReduceOp>(
- loc, extracted, op.getOp(), op.getUniform(), clusterSize);
+ loc, extracted, op.getOp(), op.getUniform(), op.getClusterSize(),
+ op.getClusterStride());
rewriter.replaceOpWithNewOp<vector::BroadcastOp>(op, vecTy, reduce);
return success();
}
};
-/// Emits a subgroup reduction using a sequence of shuffles. Uses the `packFn`
-/// and `unpackFn` to convert to the native shuffle type and to the reduction
-/// type, respectively. For example, with `input` of type `f16`, `packFn` could
-/// build ops to cast the value to `i32` to perform shuffles, while `unpackFn`
-/// would cast it back to `f16` to perform arithmetic reduction on. Assumes that
-/// the subgroup is `subgroupSize` lanes wide and divides it into clusters of
-/// `clusterSize` lanes, reducing all lanes in each cluster in parallel.
-static Value createSubgroupShuffleReduction(
- OpBuilder &builder, Location loc, Value input, gpu::AllReduceOperation mode,
- unsigned clusterSize, unsigned subgroupSize,
- function_ref<Value(Value)> packFn, function_ref<Value(Value)> unpackFn) {
- assert(llvm::isPowerOf2_32(clusterSize));
- assert(llvm::isPowerOf2_32(subgroupSize));
- assert(clusterSize <= subgroupSize);
- // Lane value always stays in the original type. We use it to perform arith
- // reductions.
- Value laneVal = input;
- // Parallel reduction using butterfly shuffles.
- for (unsigned i = 1; i < clusterSize; i <<= 1) {
- Value shuffled = builder
- .create<gpu::ShuffleOp>(loc, packFn(laneVal), i,
- /*width=*/subgroupSize,
- /*mode=*/gpu::ShuffleMode::XOR)
- .getShuffleResult();
- laneVal = vector::makeArithReduction(builder, loc,
- gpu::convertReductionKind(mode),
- laneVal, unpackFn(shuffled));
- assert(laneVal.getType() == input.getType());
+struct ClusterInfo {
+ unsigned clusterStride;
+ unsigned clusterSize;
+ unsigned subgroupSize;
+ LogicalResult getAndValidate(gpu::SubgroupReduceOp op,
+ unsigned subgroupSize) {
----------------
andfau-amd wrote:
I refactored the code around this struct because passing three unsigned integer parameters around was getting unwieldy, and because I wanted the two users to share the logic that fetches those values from the op and validates them. That said, since only two places use that logic, I'm not sure I really made the right choice there.
I originally tried to make `getAndValidate` a free function, but keeping `clusterStride`, `clusterSize` and `subgroupSize` as variables in scope (as struct members) was more convenient. Maybe making it a member function like this is a bit too weird?
https://github.com/llvm/llvm-project/pull/107142
More information about the Mlir-commits mailing list