[Mlir-commits] [mlir] [AMDGPU] Implement gpu.subgroup_reduce with DPP intrinsics on AMD GPUs (PR #133204)

Krzysztof Drewniak llvmlistbot at llvm.org
Tue Apr 15 08:14:56 PDT 2025


================
@@ -362,6 +365,131 @@ struct VectorSubgroupReduceToShuffles final
   unsigned shuffleBitwidth = 0;
   bool matchClustered = false;
 };
+
+Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
+                                 gpu::AllReduceOperation mode,
+                                 const ClusterInfo &ci,
+                                 amdgpu::Chipset chipset) {
+  Value dppResult;
+  Value result = input;
+  const int allRows = 0xf;
+  const int allBanks = 0xf;
+  const bool boundCtrl = true;
+  Value lane31 =
+      b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(31));
+  Value lane63 =
+      b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(63));
+  if (ci.clusterSize >= 2) {
+    auto permArg = b.getI32ArrayAttr({1, 0, 3, 2});
+    dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+                                        amdgpu::DPPPerm::quad_perm, permArg,
+                                        allRows, allBanks, boundCtrl);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize >= 4) {
+    auto permArg = b.getI32ArrayAttr({2, 3, 0, 1});
+    dppResult = b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+                                        amdgpu::DPPPerm::quad_perm, permArg,
+                                        allRows, allBanks, boundCtrl);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize >= 8) {
+    dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+        b.getUnitAttr(), allRows, allBanks, boundCtrl);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize >= 16) {
+    dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror,
+        b.getUnitAttr(), allRows, allBanks, boundCtrl);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize >= 32) {
+    if (chipset.majorVersion <= 9) {
+      dppResult = b.create<amdgpu::DPPOp>(
+          loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+          b.getUnitAttr(), 0xa, allBanks, /*bound_ctrl=*/false);
+    } else if (chipset.majorVersion >= 10) {
+      Value uIntMaxConst = b.create<arith::ConstantOp>(loc, b.getI32Type(),
+                                                       b.getI32IntegerAttr(-1));
+      dppResult = b.create<ROCDL::PermlaneX16Op>(
+          loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst,
+          true, false);
+    }
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+    if (ci.subgroupSize == 32) {
+      result =
+          b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane31);
+    }
+  }
+
+  if (ci.clusterSize == 64) {
+    dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
+        b.getUnitAttr(), 0xc, allBanks, /*bound_ctrl=*/false);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
----------------
krzysz00 wrote:

What prevents ci.clusterSize > 64? Is that condition checked elsewhere?
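
For reference, a guard of roughly this shape in the calling pattern would enforce the bound before the DPP ladder is emitted. This is a hypothetical sketch, not code from the PR; `op`, `rewriter`, and the failure message are stand-ins for the pattern's own names, and whether such a check already exists elsewhere is exactly the question above:

```cpp
// Hypothetical guard in the lowering pattern's matchAndRewrite, run before
// createSubgroupDPPReduction is invoked. It bails out on cluster sizes wider
// than the 64-lane wavefront that the DPP stages above can cover.
if (ci.clusterSize > 64)
  return rewriter.notifyMatchFailure(
      op, "DPP-based subgroup reduction only supports cluster sizes up to 64");
```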

https://github.com/llvm/llvm-project/pull/133204

