[Mlir-commits] [mlir] [mlir][amdgpu] Promote gpu.shuffle to amdgpu.dpp (PR #155158)

Sun Aug 24 04:15:52 PDT 2025

https://github.com/tgymnich updated https://github.com/llvm/llvm-project/pull/155158

>From d83016555a199dd2bbdcfe9c9b13fe80137c9edb Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Wed, 20 Aug 2025 07:45:05 -0500
Subject: [PATCH 1/3] PromoteShuffleToDPPPattern

---
 .../GPU/Transforms/PromoteShuffleToAMDGPU.cpp | 93 ++++++++++++++++++-
 .../Conversion/GPUToROCDL/gpu-to-rocdl.mlir   | 38 ++++++++
 2 files changed, 129 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp b/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp
index 67cef8af1e3b5..4144846e10934 100644
--- a/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp
@@ -85,7 +85,8 @@ struct PromoteShuffleToPermlanePattern
 
     int64_t offsetValue = *offset;
     if (offsetValue != 16 && offsetValue != 32)
-      return rewriter.notifyMatchFailure(op, "offset must be either 15 or 31");
+      return rewriter.notifyMatchFailure(op, "offset must be either 16 or 32");
+
 
     Location loc = op.getLoc();
     Value res = amdgpu::PermlaneSwapOp::create(
@@ -96,13 +97,101 @@ struct PromoteShuffleToPermlanePattern
   }
 };
 
+/// Try to promote `gpu.shuffle` to `amdgpu.dpp`: width must be a constant in
+/// {4, 8, 12, 16, 32, 48, 64}; constant offset 1 or 2 for xor, 1 for up/down.
+struct PromoteShuffleToDPPPattern : public OpRewritePattern<gpu::ShuffleOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(gpu::ShuffleOp op,
+                                PatternRewriter &rewriter) const override {
+    std::optional<int64_t> width = getConstantIntValue(op.getWidth());
+    if (!width)
+      return rewriter.notifyMatchFailure(op,
+                                         "width must be a constant integer");
+    int64_t widthValue = *width;
+    if (widthValue != 4 && widthValue != 8 && widthValue != 12 &&
+        widthValue != 16 && widthValue != 32 && widthValue != 48 &&
+        widthValue != 64)
+      return rewriter.notifyMatchFailure(
+          op, "width must be 4, 8, 12, 16, 32, 48 or 64");
+
+    std::optional<int64_t> offset = getConstantIntValue(op.getOffset());
+    if (!offset)
+      return rewriter.notifyMatchFailure(op,
+                                         "offset must be a constant integer");
+
+    int64_t offsetValue = *offset;
+    Attribute permAttr = rewriter.getUnitAttr();
+
+    amdgpu::DPPPerm kind;
+    switch (op.getMode()) {
+    case gpu::ShuffleMode::XOR: {
+      if (offsetValue != 1 && offsetValue != 2)
+        return rewriter.notifyMatchFailure(
+            op, "xor shuffle mode is only supported for offsets of 1 or 2");
+      // Entry i of the quad permutation is i ^ offset.
+      kind = amdgpu::DPPPerm::quad_perm;
+      permAttr = offsetValue == 1 ? rewriter.getI32ArrayAttr({1, 0, 3, 2})
+                                  : rewriter.getI32ArrayAttr({2, 3, 0, 1});
+      break;
+    }
+    case gpu::ShuffleMode::UP: {
+      if (offsetValue != 1)
+        return rewriter.notifyMatchFailure(
+            op, "up shuffle mode is only supported for offset 1");
+      kind = amdgpu::DPPPerm::wave_shr;
+      break;
+    }
+    case gpu::ShuffleMode::DOWN: {
+      if (offsetValue != 1)
+        return rewriter.notifyMatchFailure(
+            op, "down shuffle mode is only supported for offset 1");
+      kind = amdgpu::DPPPerm::wave_shl;
+      break;
+    }
+    case gpu::ShuffleMode::IDX:
+      return rewriter.notifyMatchFailure(op,
+                                         "idx shuffle mode is not supported");
+    }
+
+    unsigned bankMask = 0xF; // narrowed only for widths 4, 8 and 12
+    if (widthValue == 4)
+      bankMask = 0x1;
+    else if (widthValue == 8)
+      bankMask = 0x3;
+    else if (widthValue == 12)
+      bankMask = 0x7;
+
+    unsigned rowMask = 0xF; // narrowed only for widths 16, 32 and 48
+    if (widthValue == 16)
+      rowMask = 0x1;
+    else if (widthValue == 32)
+      rowMask = 0x3;
+    else if (widthValue == 48)
+      rowMask = 0x7;
+
+    constexpr bool boundCtrl = false;
+    Location loc = op.getLoc();
+    Value dpp = amdgpu::DPPOp::create(rewriter, loc, op.getResult(0).getType(),
+                                      op.getValue(), op.getValue(), kind,
+                                      permAttr, rowMask, bankMask, boundCtrl);
+    // NOTE(review): `valid` is always true here; confirm that matches
+    // gpu.shuffle up/down for lanes whose source falls outside the width.
+    Value valid = arith::ConstantIntOp::create(rewriter, loc, 1, /*width*/ 1);
+    rewriter.replaceOp(op, {dpp, valid});
+    return success();
+  }
+};
+
 } // namespace
 
 void mlir::populateGpuPromoteShuffleToAMDGPUPatterns(
     RewritePatternSet &patterns, std::optional<amdgpu::Chipset> maybeChipset) {
   patterns.add<PromoteShuffleToSwizzlePattern>(patterns.getContext(),
                                                /*benefit*/ 1);
+  patterns.add<PromoteShuffleToDPPPattern>(patterns.getContext(),
+                                           /*benefit*/ 2);
   if (maybeChipset && *maybeChipset >= kGfx950)
     patterns.add<PromoteShuffleToPermlanePattern>(patterns.getContext(),
-                                                  /*benefit*/ 2);
+                                                  /*benefit*/ 3);
 }
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 71c3e9974611e..a0a61fd689e77 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -760,6 +760,44 @@ gpu.module @test_module {
     func.return %shfl1, %shfl2, %shfl3 : f32, f32, f32
   }
 
+  // CHECK-LABEL: func @gpu_butterfly_shuffle()
+  func.func @gpu_butterfly_shuffle() -> (f32, f32, f32, f32, f32, f32) {
+    // CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
+    %arg0 = arith.constant 1.0 : f32
+    %c1 = arith.constant 1 : i32
+    %c2 = arith.constant 2 : i32
+    %c4 = arith.constant 4 : i32
+    %c8 = arith.constant 8 : i32
+    %c16 = arith.constant 16 : i32
+    %c32 = arith.constant 32 : i32
+    %c64 = arith.constant 64 : i32
+    // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 177, 15, 15, false : f32
+    %shfl1, %pred1 = gpu.shuffle xor %arg0, %c1, %c64 : f32
+    // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 78, 15, 15, false : f32
+    %shfl2, %pred2 = gpu.shuffle xor %arg0, %c2, %c64 : f32
+    // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
+    // CHECK: %[[#MASK:]] = llvm.mlir.constant(4127 : i32) : i32
+    // CHECK: %[[#PERMUTE:]] = rocdl.ds_swizzle %[[#CAST_VALUE]], %[[#MASK]] : (i32, i32) -> i32
+    // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
+    %shfl3, %pred3 = gpu.shuffle xor %arg0, %c4, %c64 : f32
+    // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
+    // CHECK: %[[#MASK:]] = llvm.mlir.constant(8223 : i32) : i32
+    // CHECK: %[[#PERMUTE:]] = rocdl.ds_swizzle %[[#CAST_VALUE]], %[[#MASK]] : (i32, i32) -> i32
+    // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
+    %shfl4, %pred4 = gpu.shuffle xor %arg0, %c8, %c64 : f32
+    // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
+    // CHECK: %[[#PERMUTE:]] = rocdl.permlane16.swap %[[#CAST_VALUE]], %[[#CAST_VALUE]], false, false : (i32, i32) -> <(i32, i32)>
+    // CHECK: %[[#EXTRACT:]] = llvm.extractvalue %[[#PERMUTE:]][0] : !llvm.struct<(i32, i32)>
+    // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#EXTRACT]] : i32 to f32
+    %shfl5, %pred5 = gpu.shuffle xor %arg0, %c16, %c64 : f32
+    // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
+    // CHECK: %[[#PERMUTE:]] = rocdl.permlane32.swap %[[#CAST_VALUE]], %[[#CAST_VALUE]], false, false : (i32, i32) -> <(i32, i32)>
+    // CHECK: %[[#EXTRACT:]] = llvm.extractvalue %[[#PERMUTE:]][0] : !llvm.struct<(i32, i32)>
+    // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#EXTRACT]] : i32 to f32
+    %shfl6, %pred6 = gpu.shuffle xor %arg0, %c32, %c64 : f32
+    func.return %shfl1, %shfl2, %shfl3, %shfl4, %shfl5, %shfl6 : f32, f32, f32, f32, f32, f32
+  }
+
   // CHECK-LABEL: func @gpu_shuffle_vec
   //  CHECK-SAME: (%[[ARG:.*]]: vector<4xf16>, %{{.*}}: i32, %{{.*}}: i32)
   func.func @gpu_shuffle_vec(%arg0: vector<4xf16>, %arg1: i32, %arg2: i32) -> vector<4xf16> {

>From 169765b3118a7f8e76680245102db404ab796836 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Sun, 24 Aug 2025 06:13:06 -0500
Subject: [PATCH 2/3] add tests

---
 mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index a0a61fd689e77..13fc870d71597 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -735,13 +735,14 @@ gpu.module @test_module {
   }
 
   // CHECK-LABEL: func @gpu_shuffle_promote()
-  func.func @gpu_shuffle_promote() -> (f32, f32, f32) {
+  func.func @gpu_shuffle_promote() -> (f32, f32, f32, f32, f32) {
     // CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
     %arg0 = arith.constant 1.0 : f32
     %arg1 = arith.constant 4 : i32
     %arg2 = arith.constant 16 : i32
     %arg3 = arith.constant 32 : i32
     %arg4 = arith.constant 64 : i32
+    %arg5 = arith.constant 1 : i32
     // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
     // CHECK: %[[#MASK:]] = llvm.mlir.constant(4127 : i32) : i32
     // CHECK: %[[#PERMUTE:]] = rocdl.ds_swizzle %[[#CAST_VALUE]], %[[#MASK]] : (i32, i32) -> i32
@@ -757,7 +758,11 @@ gpu.module @test_module {
     // CHECK: %[[#EXTRACT:]] = llvm.extractvalue %[[#PERMUTE:]][0] : !llvm.struct<(i32, i32)>
     // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#EXTRACT]] : i32 to f32
     %shfl3, %pred3 = gpu.shuffle xor  %arg0, %arg3, %arg4 : f32
-    func.return %shfl1, %shfl2, %shfl3 : f32, f32, f32
+    // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 312, 1, 15, false : f32
+    %shflu, %predu = gpu.shuffle up  %arg0, %arg5, %arg2 : f32
+    // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 304, 1, 15, false : f32
+    %shfld, %predd = gpu.shuffle down %arg0, %arg5, %arg2 : f32
+    func.return %shfl1, %shfl2, %shfl3, %shflu, %shfld : f32, f32, f32, f32, f32
   }
 
   // CHECK-LABEL: func @gpu_butterfly_shuffle()

>From d176399693c4f6f4f48bc5107512bbfb1656c81b Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Sun, 24 Aug 2025 06:15:40 -0500
Subject: [PATCH 3/3] clang format

---
 mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp b/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp
index 4144846e10934..c4e09a0845b40 100644
--- a/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp
@@ -87,7 +87,6 @@ struct PromoteShuffleToPermlanePattern
     if (offsetValue != 16 && offsetValue != 32)
       return rewriter.notifyMatchFailure(op, "offset must be either 16 or 32");
 
-
     Location loc = op.getLoc();
     Value res = amdgpu::PermlaneSwapOp::create(
         rewriter, loc, op.getResult(0).getType(), op.getValue(), offsetValue);