[Mlir-commits] [mlir] [mlir][amdgpu] Promote gpu.shuffle to amdgpu.dpp (PR #155158)
Tim Gymnich
llvmlistbot at llvm.org
Thu Oct 30 14:21:18 PDT 2025
https://github.com/tgymnich updated https://github.com/llvm/llvm-project/pull/155158
>From 65a906590ecd126db68f1b1e29595657d18524dc Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Wed, 20 Aug 2025 07:45:05 -0500
Subject: [PATCH] PromoteShuffleToDPPPattern
---
.../GPU/Transforms/PromoteShuffleToAMDGPU.cpp | 149 +++++++++++++++++-
.../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 86 +++++++++-
2 files changed, 228 insertions(+), 7 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp b/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp
index 67cef8af1e3b5..01a6c93965f96 100644
--- a/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp
@@ -11,12 +11,13 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
-#include "mlir/Dialect/GPU/Transforms/Passes.h"
-
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/IR/PatternMatch.h"
#include <optional>
@@ -85,7 +86,7 @@ struct PromoteShuffleToPermlanePattern
int64_t offsetValue = *offset;
if (offsetValue != 16 && offsetValue != 32)
- return rewriter.notifyMatchFailure(op, "offset must be either 15 or 31");
+ return rewriter.notifyMatchFailure(op, "offset must be either 16 or 32");
Location loc = op.getLoc();
Value res = amdgpu::PermlaneSwapOp::create(
@@ -96,13 +97,151 @@ struct PromoteShuffleToPermlanePattern
}
};
+static Value getLaneId(RewriterBase &rewriter, Location loc) {
+ auto int32Type = IntegerType::get(rewriter.getContext(), 32);
+ Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 32);
+ Value minus1 = arith::ConstantIntOp::create(rewriter, loc, -1, 32);
+ NamedAttribute noundef = {LLVM::LLVMDialect::getNoUndefAttrName(),
+ rewriter.getUnitAttr()};
+ NamedAttribute lowRange = {LLVM::LLVMDialect::getRangeAttrName(),
+ LLVM::ConstantRangeAttr::get(rewriter.getContext(),
+ APInt::getZero(32),
+ APInt(32, 32))};
+ NamedAttribute highRange = {
+ LLVM::LLVMDialect::getRangeAttrName(),
+ LLVM::ConstantRangeAttr::get(rewriter.getContext(), APInt::getZero(32),
+ APInt(32, 64))};
+ Value mbcntLo = ROCDL::MbcntLoOp::create(
+ rewriter, loc, int32Type, minus1, zero, /*arg_attrs=*/{},
+ /*res_attrs=*/
+ rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, lowRange})));
+ Value laneId = ROCDL::MbcntHiOp::create(
+ rewriter, loc, int32Type, minus1, mbcntLo, /*arg_attrs=*/{},
+ rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, highRange})));
+ return laneId;
+}
+
+/// Try to promote `gpu.shuffle` to `amdgpu.dpp`. Width must be a constant in
+/// {4, 8, 12, 16, 32, 48, 64}; offset a constant 1 or 2 (xor) or 1 (up/down).
+struct PromoteShuffleToDPPPattern : public OpRewritePattern<gpu::ShuffleOp> {
+ using OpRewritePattern::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(gpu::ShuffleOp op,
+ PatternRewriter &rewriter) const override {
+ std::optional<int64_t> width = getConstantIntValue(op.getWidth());
+ if (!width)
+ return rewriter.notifyMatchFailure(op,
+ "width must be a constant integer");
+ int64_t widthValue = *width;
+ if (!llvm::is_contained({4, 8, 12, 16, 32, 48, 64}, widthValue))
+ return rewriter.notifyMatchFailure(
+ op, "width must be 4, 8, 12, 16, 32, 48 or 64");
+
+ std::optional<int64_t> offset = getConstantIntValue(op.getOffset());
+ if (!offset)
+ return rewriter.notifyMatchFailure(op,
+ "offset must be a constant integer");
+
+ int64_t offsetValue = *offset;
+ Location loc = op.getLoc();
+ auto int32Type = IntegerType::get(rewriter.getContext(), 32);
+
+ amdgpu::DPPPerm kind;
+ Attribute permAttr = rewriter.getUnitAttr();
+ Value srcLane;
+ Value dstLane;
+ switch (op.getMode()) {
+ case gpu::ShuffleMode::XOR: {
+ if (offsetValue != 1 && offsetValue != 2)
+ return rewriter.notifyMatchFailure(
+ op, "xor shuffle mode is only supported for offsets of 1 or 2");
+ kind = amdgpu::DPPPerm::quad_perm;
+ srcLane = getLaneId(rewriter, loc);
+ dstLane = LLVM::XOrOp::create(rewriter, loc, int32Type, srcLane,
+ op.getOffset());
+
+ if (offsetValue == 1)
+ permAttr = rewriter.getI32ArrayAttr({1, 0, 3, 2});
+ else if (offsetValue == 2)
+ permAttr = rewriter.getI32ArrayAttr({2, 3, 0, 1});
+ break;
+ }
+ case gpu::ShuffleMode::UP: {
+ if (offsetValue != 1)
+ return rewriter.notifyMatchFailure(
+ op, "up shuffle mode is only supported for offset 1");
+ kind = amdgpu::DPPPerm::wave_shr;
+ srcLane = getLaneId(rewriter, loc);
+ dstLane = LLVM::SubOp::create(rewriter, loc, int32Type, srcLane,
+ op.getOffset());
+ break;
+ }
+ case gpu::ShuffleMode::DOWN: {
+ if (offsetValue != 1)
+ return rewriter.notifyMatchFailure(
+ op, "down shuffle mode is only supported for offset 1");
+ kind = amdgpu::DPPPerm::wave_shl;
+ srcLane = getLaneId(rewriter, loc);
+ dstLane = LLVM::AddOp::create(rewriter, loc, int32Type, srcLane,
+ op.getOffset());
+ break;
+ }
+ case gpu::ShuffleMode::IDX:
+ return rewriter.notifyMatchFailure(op,
+ "idx shuffle mode is not supported");
+ }
+
+ unsigned bankMask = 0xF;
+ if (widthValue == 4)
+ bankMask = 0x1;
+ else if (widthValue == 8)
+ bankMask = 0x3;
+ else if (widthValue == 12)
+ bankMask = 0x7;
+
+ unsigned rowMask = 0xF;
+ if (widthValue == 16)
+ rowMask = 0x1;
+ else if (widthValue == 32)
+ rowMask = 0x3;
+ else if (widthValue == 48)
+ rowMask = 0x7;
+
+ constexpr bool boundCtrl = false;
+
+ Value negwidth =
+ arith::ConstantIntOp::create(rewriter, loc, int32Type, -widthValue);
+ Value add =
+ arith::AddIOp::create(rewriter, loc, int32Type, srcLane, op.getWidth());
+ Value widthOrZeroIfOutside =
+ arith::AndIOp::create(rewriter, loc, int32Type, add, negwidth);
+ Value isActiveSrcLane =
+ arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::slt, dstLane,
+ widthOrZeroIfOutside);
+
+ Value dpp = amdgpu::DPPOp::create(rewriter, loc, op.getResult(0).getType(),
+ op.getValue(), op.getValue(), kind,
+ permAttr, rowMask, bankMask, boundCtrl);
+ Value poison =
+ LLVM::PoisonOp::create(rewriter, loc, op.getResult(0).getType());
+
+ Value selectResult =
+ arith::SelectOp::create(rewriter, loc, isActiveSrcLane, dpp, poison);
+
+ rewriter.replaceOp(op, {selectResult, isActiveSrcLane});
+ return success();
+ }
+};
+
} // namespace
void mlir::populateGpuPromoteShuffleToAMDGPUPatterns(
RewritePatternSet &patterns, std::optional<amdgpu::Chipset> maybeChipset) {
patterns.add<PromoteShuffleToSwizzlePattern>(patterns.getContext(),
/*benefit*/ 1);
+ patterns.add<PromoteShuffleToDPPPattern>(patterns.getContext(),
+ /*benefit*/ 2);
if (maybeChipset && *maybeChipset >= kGfx950)
patterns.add<PromoteShuffleToPermlanePattern>(patterns.getContext(),
- /*benefit*/ 2);
+ /*benefit*/ 3);
}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index ef631ce8a12e5..5b5caf3dc0e8f 100755
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -735,13 +735,18 @@ gpu.module @test_module {
}
// CHECK-LABEL: func @gpu_shuffle_promote()
- func.func @gpu_shuffle_promote() -> (f32, f32, f32) {
+ func.func @gpu_shuffle_promote() -> (f32, f32, f32, f32, f32) {
+ // CHECK: %[[#POISON:]] = llvm.mlir.poison : f32
+ // CHECK: %[[#NEGWIDTH:]] = llvm.mlir.constant(-64 : i32) : i32
// CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
%arg0 = arith.constant 1.0 : f32
%arg1 = arith.constant 4 : i32
%arg2 = arith.constant 16 : i32
%arg3 = arith.constant 32 : i32
+ // CHECK: %[[#WIDTH:]] = llvm.mlir.constant(64 : i32) : i32
%arg4 = arith.constant 64 : i32
+ // CHECK: %[[#C1:]] = llvm.mlir.constant(1 : i32) : i32
+ %arg5 = arith.constant 1 : i32
// CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
// CHECK: %[[#MASK:]] = llvm.mlir.constant(4127 : i32) : i32
// CHECK: %[[#PERMUTE:]] = rocdl.ds_swizzle %[[#CAST_VALUE]], %[[#MASK]] : (i32, i32) -> i32
@@ -763,7 +768,84 @@ gpu.module @test_module {
// CHECK: %[[#SEL:]] = llvm.select %[[#CMP]], %[[#EXTRACT1]], %[[#EXTRACT0]] : i1, i32
// CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#SEL]] : i32 to f32
%shfl3, %pred3 = gpu.shuffle xor %arg0, %arg3, %arg4 : f32
- func.return %shfl1, %shfl2, %shfl3 : f32, f32, f32
+ // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
+ // CHECK: %[[#SUB:]] = llvm.sub %[[#LANE_ID]], %[[#C1]] : i32
+ // CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32
+ // CHECK: %[[#AND:]] = llvm.and %[[#ADD]], %[[#NEGWIDTH]] : i32
+ // CHECK: %[[#VALID:]] = llvm.icmp "slt" %[[#SUB]], %[[#AND]] : i32
+ // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 312, 15, 15, false : f32
+ // CHECK: %[[#SELECT:]] = llvm.select %[[#VALID]], %[[#PERMUTE]], %[[#POISON]] : i1, f32
+ %shflu, %predu = gpu.shuffle up %arg0, %arg5, %arg4 : f32
+ // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
+ // CHECK: %[[#OP:]] = llvm.add %[[#LANE_ID]], %[[#C1]] : i32
+ // CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32
+ // CHECK: %[[#AND:]] = llvm.and %[[#ADD]], %[[#NEGWIDTH]] : i32
+ // CHECK: %[[#VALID:]] = llvm.icmp "slt" %[[#OP]], %[[#AND]] : i32
+ // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 304, 15, 15, false : f32
+ // CHECK: %[[#SELECT:]] = llvm.select %[[#VALID]], %[[#PERMUTE]], %[[#POISON]] : i1, f32
+ %shfld, %predd = gpu.shuffle down %arg0, %arg5, %arg4 : f32
+ func.return %shfl1, %shfl2, %shfl3, %shflu, %shfld : f32, f32, f32, f32, f32
+ }
+
+ // CHECK-LABEL: func @gpu_butterfly_shuffle()
+ func.func @gpu_butterfly_shuffle() -> (f32, f32, f32, f32, f32, f32) {
+ // CHECK: %[[#POISON:]] = llvm.mlir.poison : f32
+ // CHECK: %[[#NEGWIDTH:]] = llvm.mlir.constant(-64 : i32) : i32
+ // CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
+ %arg0 = arith.constant 1.0 : f32
+ // CHECK: %[[#C1:]] = llvm.mlir.constant(1 : i32) : i32
+ %c1 = arith.constant 1 : i32
+ // CHECK: %[[#C2:]] = llvm.mlir.constant(2 : i32) : i32
+ %c2 = arith.constant 2 : i32
+ %c4 = arith.constant 4 : i32
+ %c8 = arith.constant 8 : i32
+ %c16 = arith.constant 16 : i32
+ %c32 = arith.constant 32 : i32
+ // CHECK: %[[#WIDTH:]] = llvm.mlir.constant(64 : i32) : i32
+ %c64 = arith.constant 64 : i32
+ // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
+ // CHECK: %[[#XOR:]] = llvm.xor %[[#LANE_ID]], %[[#C1]] : i32
+ // CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32
+ // CHECK: %[[#AND:]] = llvm.and %[[#ADD]], %[[#NEGWIDTH]] : i32
+ // CHECK: %[[#VALID:]] = llvm.icmp "slt" %[[#XOR]], %[[#AND]] : i32
+ // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 177, 15, 15, false : f32
+ // CHECK: %[[#SELECT:]] = llvm.select %[[#VALID]], %[[#PERMUTE]], %[[#POISON]] : i1, f32
+ %shfl1, %pred1 = gpu.shuffle xor %arg0, %c1, %c64 : f32
+ // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
+ // CHECK: %[[#XOR:]] = llvm.xor %[[#LANE_ID]], %[[#C2]] : i32
+ // CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32
+ // CHECK: %[[#AND:]] = llvm.and %[[#ADD]], %[[#NEGWIDTH]] : i32
+ // CHECK: %[[#VALID:]] = llvm.icmp "slt" %[[#XOR]], %[[#AND]] : i32
+ // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 78, 15, 15, false : f32
+ // CHECK: %[[#SELECT:]] = llvm.select %[[#VALID]], %[[#PERMUTE]], %[[#POISON]] : i1, f32
+ %shfl2, %pred2 = gpu.shuffle xor %arg0, %c2, %c64 : f32
+ // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
+ // CHECK: %[[#MASK:]] = llvm.mlir.constant(4127 : i32) : i32
+ // CHECK: %[[#PERMUTE:]] = rocdl.ds_swizzle %[[#CAST_VALUE]], %[[#MASK]] : (i32, i32) -> i32
+ // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
+ %shfl3, %pred3 = gpu.shuffle xor %arg0, %c4, %c64 : f32
+ // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
+ // CHECK: %[[#MASK:]] = llvm.mlir.constant(8223 : i32) : i32
+ // CHECK: %[[#PERMUTE:]] = rocdl.ds_swizzle %[[#CAST_VALUE]], %[[#MASK]] : (i32, i32) -> i32
+ // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
+ %shfl4, %pred4 = gpu.shuffle xor %arg0, %c8, %c64 : f32
+ // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
+ // CHECK: %[[#PERMUTE:]] = rocdl.permlane16.swap %[[#CAST_VALUE]], %[[#CAST_VALUE]], false, false : (i32, i32) -> <(i32, i32)>
+ // CHECK: %[[#EXTRACT0:]] = llvm.extractvalue %[[#PERMUTE:]][0] : !llvm.struct<(i32, i32)>
+ // CHECK: %[[#EXTRACT1:]] = llvm.extractvalue %[[#PERMUTE:]][1] : !llvm.struct<(i32, i32)>
+ // CHECK: %[[#CMP:]] = llvm.icmp "eq" %[[#EXTRACT0]], %[[#CAST_VALUE]] : i32
+ // CHECK: %[[#SEL:]] = llvm.select %[[#CMP]], %[[#EXTRACT1]], %[[#EXTRACT0]] : i1, i32
+ // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#SEL]] : i32 to f32
+ %shfl5, %pred5 = gpu.shuffle xor %arg0, %c16, %c64 : f32
+ // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
+ // CHECK: %[[#PERMUTE:]] = rocdl.permlane32.swap %[[#CAST_VALUE]], %[[#CAST_VALUE]], false, false : (i32, i32) -> <(i32, i32)>
+ // CHECK: %[[#EXTRACT0:]] = llvm.extractvalue %[[#PERMUTE:]][0] : !llvm.struct<(i32, i32)>
+ // CHECK: %[[#EXTRACT1:]] = llvm.extractvalue %[[#PERMUTE:]][1] : !llvm.struct<(i32, i32)>
+ // CHECK: %[[#CMP:]] = llvm.icmp "eq" %[[#EXTRACT0]], %[[#CAST_VALUE]] : i32
+ // CHECK: %[[#SEL:]] = llvm.select %[[#CMP]], %[[#EXTRACT1]], %[[#EXTRACT0]] : i1, i32
+ // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#SEL]] : i32 to f32
+ %shfl6, %pred6 = gpu.shuffle xor %arg0, %c32, %c64 : f32
+ func.return %shfl1, %shfl2, %shfl3, %shfl4, %shfl5, %shfl6 : f32, f32, f32, f32, f32, f32
}
// CHECK-LABEL: func @gpu_shuffle_vec
More information about the Mlir-commits
mailing list