[Mlir-commits] [mlir] [mlir][amdgpu] Promote gpu.shuffle to amdgpu.dpp (PR #155158)
Tim Gymnich
llvmlistbot at llvm.org
Sun Aug 24 04:13:19 PDT 2025
https://github.com/tgymnich updated https://github.com/llvm/llvm-project/pull/155158
>From d83016555a199dd2bbdcfe9c9b13fe80137c9edb Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Wed, 20 Aug 2025 07:45:05 -0500
Subject: [PATCH 1/2] PromoteShuffleToDPPPattern
---
.../GPU/Transforms/PromoteShuffleToAMDGPU.cpp | 93 ++++++++++++++++++-
.../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 38 ++++++++
2 files changed, 129 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp b/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp
index 67cef8af1e3b5..4144846e10934 100644
--- a/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp
@@ -85,7 +85,8 @@ struct PromoteShuffleToPermlanePattern
int64_t offsetValue = *offset;
if (offsetValue != 16 && offsetValue != 32)
- return rewriter.notifyMatchFailure(op, "offset must be either 15 or 31");
+ return rewriter.notifyMatchFailure(op, "offset must be either 16 or 32");
+
Location loc = op.getLoc();
Value res = amdgpu::PermlaneSwapOp::create(
@@ -96,13 +97,101 @@ struct PromoteShuffleToPermlanePattern
}
};
+/// Try to promote `gpu.shuffle` to `amdgpu.dpp`: width must be a constant in
+/// {4, 8, 12, 16, 32, 48, 64}; offset must be 1 or 2 for xor, 1 for up/down.
+struct PromoteShuffleToDPPPattern : public OpRewritePattern<gpu::ShuffleOp> {
+ using OpRewritePattern::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(gpu::ShuffleOp op,
+ PatternRewriter &rewriter) const override {
+ std::optional<int64_t> width = getConstantIntValue(op.getWidth());
+ if (!width)
+ return rewriter.notifyMatchFailure(op,
+ "width must be a constant integer");
+ int64_t widthValue = *width;
+ if (widthValue != 4 && widthValue != 8 && widthValue != 12 &&
+ widthValue != 16 && widthValue != 32 && widthValue != 48 &&
+ widthValue != 64)
+ return rewriter.notifyMatchFailure(
+ op, "width must be 4, 8, 12, 16, 32, 48 or 64");
+
+ std::optional<int64_t> offset = getConstantIntValue(op.getOffset());
+ if (!offset)
+ return rewriter.notifyMatchFailure(op,
+ "offset must be a constant integer");
+
+ int64_t offsetValue = *offset;
+ Attribute permAttr = rewriter.getUnitAttr();
+
+ amdgpu::DPPPerm kind;
+ switch (op.getMode()) {
+ case gpu::ShuffleMode::XOR: {
+ if (offsetValue != 1 && offsetValue != 2)
+ return rewriter.notifyMatchFailure(
+ op, "xor shuffle mode is only supported for offsets of 1 or 2");
+ kind = amdgpu::DPPPerm::quad_perm;
+ if (offsetValue == 1)
+ permAttr = rewriter.getI32ArrayAttr({1, 0, 3, 2});
+ else if (offsetValue == 2)
+ permAttr = rewriter.getI32ArrayAttr({2, 3, 0, 1});
+ break;
+ }
+ case gpu::ShuffleMode::UP: {
+ if (offsetValue != 1)
+ return rewriter.notifyMatchFailure(
+ op, "up shuffle mode is only supported for offset 1");
+ kind = amdgpu::DPPPerm::wave_shr;
+ break;
+ }
+ case gpu::ShuffleMode::DOWN: {
+ if (offsetValue != 1)
+ return rewriter.notifyMatchFailure(
+ op, "down shuffle mode is only supported for offset 1");
+ kind = amdgpu::DPPPerm::wave_shl;
+ break;
+ }
+ case gpu::ShuffleMode::IDX:
+ return rewriter.notifyMatchFailure(op,
+ "idx shuffle mode is not supported");
+ }
+
+ unsigned bankMask = 0xF;
+ if (width == 4)
+ bankMask = 0x1;
+ else if (width == 8)
+ bankMask = 0x3;
+ else if (width == 12)
+ bankMask = 0x7;
+
+ unsigned rowMask = 0xF;
+ if (width == 16)
+ rowMask = 0x1;
+ else if (width == 32)
+ rowMask = 0x3;
+ else if (width == 48)
+ rowMask = 0x7;
+
+ constexpr bool boundCtrl = false;
+
+ Location loc = op.getLoc();
+ Value dpp = amdgpu::DPPOp::create(rewriter, loc, op.getResult(0).getType(),
+ op.getValue(), op.getValue(), kind,
+ permAttr, rowMask, bankMask, boundCtrl);
+ Value valid = arith::ConstantIntOp::create(rewriter, loc, 1, /*width*/ 1);
+ rewriter.replaceOp(op, {dpp, valid});
+ return success();
+ }
+};
+
} // namespace
void mlir::populateGpuPromoteShuffleToAMDGPUPatterns(
RewritePatternSet &patterns, std::optional<amdgpu::Chipset> maybeChipset) {
patterns.add<PromoteShuffleToSwizzlePattern>(patterns.getContext(),
/*benefit*/ 1);
+ patterns.add<PromoteShuffleToDPPPattern>(patterns.getContext(),
+ /*benefit*/ 2);
if (maybeChipset && *maybeChipset >= kGfx950)
patterns.add<PromoteShuffleToPermlanePattern>(patterns.getContext(),
- /*benefit*/ 2);
+ /*benefit*/ 3);
}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 71c3e9974611e..a0a61fd689e77 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -760,6 +760,44 @@ gpu.module @test_module {
func.return %shfl1, %shfl2, %shfl3 : f32, f32, f32
}
+ // CHECK-LABEL: func @gpu_butterfly_shuffle()
+ func.func @gpu_butterfly_shuffle() -> (f32, f32, f32, f32, f32, f32) {
+ // CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
+ %arg0 = arith.constant 1.0 : f32
+ %c1 = arith.constant 1 : i32
+ %c2 = arith.constant 2 : i32
+ %c4 = arith.constant 4 : i32
+ %c8 = arith.constant 8 : i32
+ %c16 = arith.constant 16 : i32
+ %c32 = arith.constant 32 : i32
+ %c64 = arith.constant 64 : i32
+ // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 177, 15, 15, false : f32
+ %shfl1, %pred1 = gpu.shuffle xor %arg0, %c1, %c64 : f32
+ // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 78, 15, 15, false : f32
+ %shfl2, %pred2 = gpu.shuffle xor %arg0, %c2, %c64 : f32
+ // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
+ // CHECK: %[[#MASK:]] = llvm.mlir.constant(4127 : i32) : i32
+ // CHECK: %[[#PERMUTE:]] = rocdl.ds_swizzle %[[#CAST_VALUE]], %[[#MASK]] : (i32, i32) -> i32
+ // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
+ %shfl3, %pred3 = gpu.shuffle xor %arg0, %c4, %c64 : f32
+ // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
+ // CHECK: %[[#MASK:]] = llvm.mlir.constant(8223 : i32) : i32
+ // CHECK: %[[#PERMUTE:]] = rocdl.ds_swizzle %[[#CAST_VALUE]], %[[#MASK]] : (i32, i32) -> i32
+ // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
+ %shfl4, %pred4 = gpu.shuffle xor %arg0, %c8, %c64 : f32
+ // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
+ // CHECK: %[[#PERMUTE:]] = rocdl.permlane16.swap %[[#CAST_VALUE]], %[[#CAST_VALUE]], false, false : (i32, i32) -> <(i32, i32)>
+ // CHECK: %[[#EXTRACT:]] = llvm.extractvalue %[[#PERMUTE:]][0] : !llvm.struct<(i32, i32)>
+ // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#EXTRACT]] : i32 to f32
+ %shfl5, %pred5 = gpu.shuffle xor %arg0, %c16, %c64 : f32
+ // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
+ // CHECK: %[[#PERMUTE:]] = rocdl.permlane32.swap %[[#CAST_VALUE]], %[[#CAST_VALUE]], false, false : (i32, i32) -> <(i32, i32)>
+ // CHECK: %[[#EXTRACT:]] = llvm.extractvalue %[[#PERMUTE:]][0] : !llvm.struct<(i32, i32)>
+ // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#EXTRACT]] : i32 to f32
+ %shfl6, %pred6 = gpu.shuffle xor %arg0, %c32, %c64 : f32
+ func.return %shfl1, %shfl2, %shfl3, %shfl4, %shfl5, %shfl6 : f32, f32, f32, f32, f32, f32
+ }
+
// CHECK-LABEL: func @gpu_shuffle_vec
// CHECK-SAME: (%[[ARG:.*]]: vector<4xf16>, %{{.*}}: i32, %{{.*}}: i32)
func.func @gpu_shuffle_vec(%arg0: vector<4xf16>, %arg1: i32, %arg2: i32) -> vector<4xf16> {
>From 169765b3118a7f8e76680245102db404ab796836 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Sun, 24 Aug 2025 06:13:06 -0500
Subject: [PATCH 2/2] add tests
---
mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index a0a61fd689e77..13fc870d71597 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -735,13 +735,14 @@ gpu.module @test_module {
}
// CHECK-LABEL: func @gpu_shuffle_promote()
- func.func @gpu_shuffle_promote() -> (f32, f32, f32) {
+ func.func @gpu_shuffle_promote() -> (f32, f32, f32, f32, f32) {
// CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
%arg0 = arith.constant 1.0 : f32
%arg1 = arith.constant 4 : i32
%arg2 = arith.constant 16 : i32
%arg3 = arith.constant 32 : i32
%arg4 = arith.constant 64 : i32
+ %arg5 = arith.constant 1 : i32
// CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
// CHECK: %[[#MASK:]] = llvm.mlir.constant(4127 : i32) : i32
// CHECK: %[[#PERMUTE:]] = rocdl.ds_swizzle %[[#CAST_VALUE]], %[[#MASK]] : (i32, i32) -> i32
@@ -757,7 +758,11 @@ gpu.module @test_module {
// CHECK: %[[#EXTRACT:]] = llvm.extractvalue %[[#PERMUTE:]][0] : !llvm.struct<(i32, i32)>
// CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#EXTRACT]] : i32 to f32
%shfl3, %pred3 = gpu.shuffle xor %arg0, %arg3, %arg4 : f32
- func.return %shfl1, %shfl2, %shfl3 : f32, f32, f32
+ // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 312, 1, 15, false : f32
+ %shflu, %predu = gpu.shuffle up %arg0, %arg5, %arg2 : f32
+ // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 304, 1, 15, false : f32
+ %shfld, %predd = gpu.shuffle down %arg0, %arg5, %arg2 : f32
+ func.return %shfl1, %shfl2, %shfl3, %shflu, %shfld : f32, f32, f32, f32, f32
}
// CHECK-LABEL: func @gpu_butterfly_shuffle()
More information about the Mlir-commits
mailing list