[Mlir-commits] [mlir] [mlir] GPUToROCDL: lower `gpu.subgroup_id` to the intrinsic where possible (PR #179422)
Ivan Butygin
llvmlistbot at llvm.org
Tue Feb 3 06:03:27 PST 2026
https://github.com/Hardcode84 updated https://github.com/llvm/llvm-project/pull/179422
>From 4e4adadee097b907b222389070ba6012bb2624cd Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Tue, 3 Feb 2026 11:03:37 +0100
Subject: [PATCH] [mlir] GPUToROCDL: lower gpu.subgroup_id to the intrinsic
where possible
Lower `gpu.subgroup_id` to the `wave.id` intrinsic on gfx12+; on older architectures, lower it to `linearized_thread_id / subgroup_size`.
---
.../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 60 ++++++++++++++++++-
.../GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir | 23 +++++++
2 files changed, 81 insertions(+), 2 deletions(-)
create mode 100644 mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 69a83468cfa84..f09e56a02eeab 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -170,6 +170,62 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
const amdgpu::Chipset chipset;
};
+struct GPUSubgroupIdOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+ // Conversion pattern for gpu.subgroup_id; the stored `chipset` selects which lowering matchAndRewrite emits.
+ GPUSubgroupIdOpToROCDL(const LLVMTypeConverter &converter,
+ amdgpu::Chipset chipset)
+ : ConvertOpToLLVMPattern<gpu::SubgroupIdOp>(converter), chipset(chipset) {
+ }
+ // Replaces `op` with an i32 subgroup id passed through truncOrExtToLLVMType; always returns success().
+ LogicalResult
+ matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+ auto int32Type = rewriter.getI32Type();
+
+ Value subgroupId;
+ if (chipset.majorVersion >= 12) {
+ // For gfx12+, use the hardware wave.id register directly.
+ subgroupId = ROCDL::WaveId::create(rewriter, loc, int32Type);
+ } else {
+ // For older architectures, compute:
+ // subgroup_id = linearized_thread_id / subgroup_size
+ // where linearized_thread_id = tid.x + dim.x * (tid.y + dim.y * tid.z)
+ Value tidX = ROCDL::ThreadIdXOp::create(rewriter, loc, int32Type);
+ Value tidY = ROCDL::ThreadIdYOp::create(rewriter, loc, int32Type);
+ Value tidZ = ROCDL::ThreadIdZOp::create(rewriter, loc, int32Type);
+ Value dimX = ROCDL::BlockDimXOp::create(rewriter, loc, int32Type);
+ Value dimY = ROCDL::BlockDimYOp::create(rewriter, loc, int32Type);
+ // dim.z is not queried: the linearization formula only uses dim.x and dim.y.
+ // linearized = tid.x + dim.x * (tid.y + dim.y * tid.z)
+ // Thread IDs and dimensions are non-negative and small, so use nuw+nsw.
+ auto flags =
+ LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
+ Value dimYxTidZ =
+ LLVM::MulOp::create(rewriter, loc, int32Type, dimY, tidZ, flags);
+ Value tidYPlusDimYxTidZ =
+ LLVM::AddOp::create(rewriter, loc, int32Type, tidY, dimYxTidZ, flags);
+ Value dimXxInner = LLVM::MulOp::create(rewriter, loc, int32Type, dimX,
+ tidYPlusDimYxTidZ, flags);
+ Value linearized = LLVM::AddOp::create(rewriter, loc, int32Type, tidX,
+ dimXxInner, flags);
+
+ Value subgroupSize =
+ ROCDL::WavefrontSizeOp::create(rewriter, loc, int32Type);
+ subgroupId = LLVM::UDivOp::create(rewriter, loc, int32Type, linearized,
+ subgroupSize);
+ }
+ // Both branches produce an i32 value; truncOrExtToLLVMType adapts it before the op is replaced.
+ subgroupId =
+ truncOrExtToLLVMType(rewriter, loc, subgroupId, *getTypeConverter());
+ rewriter.replaceOp(op, subgroupId);
+ return success();
+ }
+ // Target chipset; its majorVersion picks between the two lowerings in matchAndRewrite.
+ const amdgpu::Chipset chipset;
+};
+
static bool isSupportedReadLaneType(Type type) {
// https://llvm.org/docs/AMDGPUUsage.html#llvm-ir-intrinsics
if (isa<Float16Type, BFloat16Type, Float32Type, Float64Type,
@@ -586,8 +642,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
GPUSubgroupBroadcastOpToROCDL>(converter);
- patterns.add<GPUSubgroupSizeOpToROCDL, GPUBarrierOpLowering>(converter,
- chipset);
+ patterns.add<GPUSubgroupIdOpToROCDL, GPUSubgroupSizeOpToROCDL,
+ GPUBarrierOpLowering>(converter, chipset);
populateMathToROCDLConversionPatterns(converter, patterns, chipset);
}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
new file mode 100644
index 0000000000000..c5bb1c5929ab9
--- /dev/null
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
@@ -0,0 +1,23 @@
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx942' | FileCheck %s --check-prefixes=CHECK,GFX9
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx1201' | FileCheck %s --check-prefixes=CHECK,GFX12
+// Verifies both lowerings of gpu.subgroup_id: the gfx942 run expects the thread-id arithmetic, the gfx1201 run expects wave.id.
+gpu.module @test_module {
+// CHECK-LABEL: func @subgroup_id()
+func.func @subgroup_id() -> index {
+ // GFX12: rocdl.wave.id : i32
+ // GFX12: llvm.sext %{{.*}} : i32 to i64
+
+ // GFX9-DAG: rocdl.workitem.id.x : i32
+ // GFX9-DAG: rocdl.workitem.id.y : i32
+ // GFX9-DAG: rocdl.workitem.id.z : i32
+ // GFX9-DAG: rocdl.workgroup.dim.x : i32
+ // GFX9-DAG: rocdl.workgroup.dim.y : i32
+ // GFX9-DAG: llvm.mul %{{.*}}, %{{.*}} overflow<nsw, nuw>
+ // GFX9-DAG: llvm.add %{{.*}}, %{{.*}} overflow<nsw, nuw>
+ // GFX9: rocdl.wavefrontsize : i32
+ // GFX9: llvm.udiv
+ // GFX9: llvm.sext %{{.*}} : i32 to i64
+ %subgroupId = gpu.subgroup_id : index
+ func.return %subgroupId : index
+}
+}
More information about the Mlir-commits
mailing list