[Mlir-commits] [mlir] [mlir][AMDGPU] Implement gpu.subgroup_reduce with DPP intrinsics on AMD GPUs (PR #133204)

Tue Apr 22 06:48:44 PDT 2025

================
@@ -362,6 +366,163 @@ struct VectorSubgroupReduceToShuffles final
   unsigned shuffleBitwidth = 0;
   bool matchClustered = false;
 };
+
+FailureOr<Value>
+createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp &op,
+                           Value input, gpu::AllReduceOperation mode,
+                           const ClusterInfo &ci, amdgpu::Chipset chipset) {
+  Location loc = op.getLoc();
+  Value dpp;
+  Value res = input;
+  constexpr int allRows = 0xf;
+  constexpr int allBanks = 0xf;
+  const bool boundCtrl = true;
+  if (ci.clusterSize >= 2) {
+    // Perform reduction between all lanes N <-> N+1.
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+        rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+
+  if (ci.clusterSize >= 4) {
+    // Perform reduction between all lanes N <-> N+2.
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+        rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+  if (ci.clusterSize >= 8) {
+    // Perform reduction between all lanes N <-> 7-N,
+    // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
+        rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+  if (ci.clusterSize >= 16) {
+    // Perform reduction between all lanes N <-> 15-N,
+    // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror,
+        rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+  if (ci.clusterSize >= 32) {
+    if (chipset.majorVersion <= 9) {
+      // Broadcast last value from each row to next row.
+      // Use row mask to avoid polluting rows 1 and 3.
+      dpp = rewriter.create<amdgpu::DPPOp>(
+          loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15,
+          rewriter.getUnitAttr(), 0xa, allBanks,
+          /*bound_ctrl*/ false);
+      res = vector::makeArithReduction(
+          rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+    } else if (chipset.majorVersion <= 12) {
+      // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
+      Value uint32Max = rewriter.create<arith::ConstantOp>(
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
+      dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
+                                                  uint32Max, uint32Max,
+                                                  /*fi=*/true,
+                                                  /*bound_ctrl=*/false);
+      res = vector::makeArithReduction(
+          rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+      if (ci.subgroupSize == 32) {
----------------
krzysz00 wrote:

Can you check the nesting of the if statements here and in the >= 64 case? For example, unless I missed it, this value of `dpp` is never used.

https://github.com/llvm/llvm-project/pull/133204