[Mlir-commits] [mlir] [mlir][gpu] Add 'cluster_stride' attribute to gpu.subgroup_reduce (PR #107142)

Wed Sep 4 04:08:13 PDT 2024

================
@@ -140,44 +137,75 @@ struct ScalarizeSingleElementReduce final
     Location loc = op.getLoc();
     Value extracted = rewriter.create<vector::ExtractOp>(loc, op.getValue(), 0);
     Value reduce = rewriter.create<gpu::SubgroupReduceOp>(
-        loc, extracted, op.getOp(), op.getUniform(), clusterSize);
+        loc, extracted, op.getOp(), op.getUniform(), op.getClusterSize(),
+        op.getClusterStride());
     rewriter.replaceOpWithNewOp<vector::BroadcastOp>(op, vecTy, reduce);
     return success();
   }
 };
 
-/// Emits a subgroup reduction using a sequence of shuffles. Uses the `packFn`
-/// and `unpackFn` to convert to the native shuffle type and to the reduction
-/// type, respectively. For example, with `input` of type `f16`, `packFn` could
-/// build ops to cast the value to `i32` to perform shuffles, while `unpackFn`
-/// would cast it back to `f16` to perform arithmetic reduction on. Assumes that
-/// the subgroup is `subgroupSize` lanes wide and divides it into clusters of
-/// `clusterSize` lanes, reducing all lanes in each cluster in parallel.
-static Value createSubgroupShuffleReduction(
-    OpBuilder &builder, Location loc, Value input, gpu::AllReduceOperation mode,
-    unsigned clusterSize, unsigned subgroupSize,
-    function_ref<Value(Value)> packFn, function_ref<Value(Value)> unpackFn) {
-  assert(llvm::isPowerOf2_32(clusterSize));
-  assert(llvm::isPowerOf2_32(subgroupSize));
-  assert(clusterSize <= subgroupSize);
-  // Lane value always stays in the original type. We use it to perform arith
-  // reductions.
-  Value laneVal = input;
-  // Parallel reduction using butterfly shuffles.
-  for (unsigned i = 1; i < clusterSize; i <<= 1) {
-    Value shuffled = builder
-                         .create<gpu::ShuffleOp>(loc, packFn(laneVal), i,
-                                                 /*width=*/subgroupSize,
-                                                 /*mode=*/gpu::ShuffleMode::XOR)
-                         .getShuffleResult();
-    laneVal = vector::makeArithReduction(builder, loc,
-                                         gpu::convertReductionKind(mode),
-                                         laneVal, unpackFn(shuffled));
-    assert(laneVal.getType() == input.getType());
+struct ClusterInfo {
+  unsigned clusterStride;
+  unsigned clusterSize;
+  unsigned subgroupSize;
+  LogicalResult getAndValidate(gpu::SubgroupReduceOp op,
+                               unsigned subgroupSize) {
----------------
andfau-amd wrote:

I've reworked this now, hopefully it's more to your liking. I do think the new way is probably clearer.

https://github.com/llvm/llvm-project/pull/107142