[Mlir-commits] [mlir] [mlir][AMDGPU] Implement gpu.subgroup_reduce with DPP intrinsics on AMD GPUs (PR #133204)
Krzysztof Drewniak
llvmlistbot at llvm.org
Wed Apr 16 13:53:11 PDT 2025
================
@@ -362,6 +366,164 @@ struct VectorSubgroupReduceToShuffles final
unsigned shuffleBitwidth = 0;
bool matchClustered = false;
};
+
+std::optional<Value> createSubgroupDPPReduction(OpBuilder &b, Location loc,
+ Value input,
+ gpu::AllReduceOperation mode,
+ const ClusterInfo &ci,
+ amdgpu::Chipset chipset) {
+ Value result = input;
+ constexpr int allRows = 0xf;
+ constexpr int allBanks = 0xf;
+ const bool boundCtrl = true;
+ Value lane0 =
+ b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(0));
+ Value lane32 =
+ b.create<arith::ConstantOp>(loc, b.getI32Type(), b.getI32IntegerAttr(32));
+
+ auto dppReduceAcrossLanes = [&](int numLanes,
+ Value res) -> std::optional<Value> {
+ Value dppResult, laneVal;
+
+ switch (numLanes) {
+ case 2:
+ // Perform reduction between all lanes N <-> N+1.
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+ b.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
+ break;
+ case 4:
+ // Perform reduction between all lanes N <-> N+2.
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+ b.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
+ break;
+ case 8:
+ // Perform reduction between all lanes N <-> 7-N,
+ // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
+ b.getUnitAttr(), allRows, allBanks, boundCtrl);
+ break;
+ case 16:
+ // Perform reduction between all lanes N <-> 15-N,
+ // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
+ dppResult = b.create<amdgpu::DPPOp>(
+ loc, result.getType(), res, res, amdgpu::DPPPerm::row_mirror,
+ b.getUnitAttr(), allRows, allBanks, boundCtrl);
+ break;
+ case 32:
+ if (chipset.majorVersion <= 9) {
+ // Broadcast last value from each row to next row.
+ // Use row mask to avoid polluting rows 1 and 3.
+ dppResult = b.create<amdgpu::DPPOp>(loc, res.getType(), res, res,
+ amdgpu::DPPPerm::row_bcast_15,
+ b.getUnitAttr(), 0xa, allBanks,
+ /*bound_ctrl*/ false);
+ } else if (chipset.majorVersion <= 12) {
+ // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
+ dppResult = b.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
+ -1, -1, /*fi=*/true,
+ /*bound_ctrl=*/false);
+ if (ci.subgroupSize == 32) {
+ dppResult =
+ b.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+ }
+ } else {
+ return std::nullopt;
+ }
+ break;
+ case 64:
+ if (chipset.majorVersion <= 9) {
+ // Broadcast 31st lane value to rows 2 and 3.
+ // Use row mask to avoid polluting rows 0 and 1.
+ dppResult = b.create<amdgpu::DPPOp>(loc, res.getType(), res, res,
+ amdgpu::DPPPerm::row_bcast_31,
+ b.getUnitAttr(), 0xc, allBanks,
+ /*bound_ctrl*/ false);
+ } else if (chipset.majorVersion <= 12) {
----------------
krzysz00 wrote:
I'd either leave this version check unbounded or emit a clear error message for when someone inevitably runs this code on gfx13.
https://github.com/llvm/llvm-project/pull/133204
More information about the Mlir-commits
mailing list