[Mlir-commits] [mlir] [MLIR][ROCDL] Add conversion for gpu.subgroup_id to ROCDL (PR #136405)
Alan Li
llvmlistbot at llvm.org
Fri Apr 25 16:54:25 PDT 2025
https://github.com/lialan updated https://github.com/llvm/llvm-project/pull/136405
>From 4c8b7169f4fefee8ae441bc843f13fc869d2345e Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sat, 19 Apr 2025 00:51:15 -0400
Subject: [PATCH] [MLIR][ROCDL] Add conversion for gpu.subgroup_id to ROCDL
---
.../Conversion/GPUToROCDL/GPUToROCDLPass.h | 7 +-
mlir/include/mlir/Conversion/Passes.td | 4 +
.../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 141 +++++++++++-------
.../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 22 +++
4 files changed, 116 insertions(+), 58 deletions(-)
diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
index 291b809071ce9..1b265ecfc48e7 100644
--- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
+++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
@@ -10,6 +10,7 @@
#include "mlir/Conversion/GPUToROCDL/Runtimes.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include <memory>
namespace mlir {
@@ -46,11 +47,7 @@ void configureGpuToROCDLConversionLegality(ConversionTarget &target);
/// index bitwidth used for the lowering of the device side index computations
/// is configurable.
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-createLowerGpuOpsToROCDLOpsPass(
- const std::string &chipset = "gfx900",
- unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
- bool useBarePtrCallConv = false,
- gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown);
+createLowerGpuOpsToROCDLOpsPass();
} // namespace mlir
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index bbba495e613b2..a558aeffba5e4 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -608,6 +608,10 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> {
clEnumValN(::mlir::gpu::amd::Runtime::HIP, "HIP", "HIP"),
clEnumValN(::mlir::gpu::amd::Runtime::OpenCL, "OpenCL",
"OpenCL"))}]>,
+ Option<"subgroupSize", "subgroup-size", "unsigned",
+ "0",
+ "specify subgroup size for the kernel, if left empty, the default "
+ "value will be decided by the target chipset.">,
ListOption<"allowedDialects", "allowed-dialects", "std::string",
"Run conversion patterns of only the specified dialects">,
];
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 0a6fb6451f700..932063aa109ba 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -52,25 +52,6 @@ namespace mlir {
using namespace mlir;
-// Truncate or extend the result depending on the index bitwidth specified
-// by the LLVMTypeConverter options.
-static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
- Location loc, Value value,
- const LLVMTypeConverter &converter) {
- int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
- int64_t indexBitwidth = converter.getIndexTypeBitwidth();
- auto indexBitwidthType =
- IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
- // TODO: use <=> in C++20.
- if (indexBitwidth > intWidth) {
- return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
- }
- if (indexBitwidth < intWidth) {
- return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
- }
- return value;
-}
-
/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
@@ -99,6 +80,26 @@ static constexpr StringLiteral amdgcnDataLayout =
"64-S32-A5-G1-ni:7:8:9";
namespace {
+
+// Truncate or extend the result depending on the index bitwidth specified
+// by the LLVMTypeConverter options.
+static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
+ Location loc, Value value,
+ const LLVMTypeConverter &converter) {
+ int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
+ int64_t indexBitwidth = converter.getIndexTypeBitwidth();
+ auto indexBitwidthType =
+ IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
+ // TODO: use <=> in C++20.
+ if (indexBitwidth > intWidth) {
+ return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
+ }
+ if (indexBitwidth < intWidth) {
+ return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
+ }
+ return value;
+}
+
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;
@@ -117,16 +118,7 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
loc, intTy, ValueRange{minus1, mbcntLo});
- // Truncate or extend the result depending on the index bitwidth specified
- // by the LLVMTypeConverter options.
- const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
- if (indexBitwidth > 32) {
- laneId = rewriter.create<LLVM::SExtOp>(
- loc, IntegerType::get(context, indexBitwidth), laneId);
- } else if (indexBitwidth < 32) {
- laneId = rewriter.create<LLVM::TruncOp>(
- loc, IntegerType::get(context, indexBitwidth), laneId);
- }
+ laneId = truncOrExtToLLVMType(rewriter, loc, laneId, *getTypeConverter());
rewriter.replaceOp(op, {laneId});
return success();
}
@@ -150,11 +142,11 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
/*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32,
/*upper=*/op.getUpperBoundAttr().getInt() + 1);
}
- Value wavefrontOp = rewriter.create<ROCDL::WavefrontSizeOp>(
+ Value wavefrontSizeOp = rewriter.create<ROCDL::WavefrontSizeOp>(
op.getLoc(), rewriter.getI32Type(), bounds);
- wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
- *getTypeConverter());
- rewriter.replaceOp(op, {wavefrontOp});
+ wavefrontSizeOp = truncOrExtToLLVMType(
+ rewriter, op.getLoc(), wavefrontSizeOp, *getTypeConverter());
+ rewriter.replaceOp(op, {wavefrontSizeOp});
return success();
}
@@ -239,6 +231,65 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
}
};
+struct GPUSubgroupIdOpToROCDL final
+ : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+ LogicalResult
+ matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ // Calculation of the thread's subgroup identifier.
+ //
+ // The process involves mapping the thread's 3D identifier within its
+ // workgroup/block (w_id.x, w_id.y, w_id.z) to a 1D linear index.
+ // This linearization assumes a layout where the x-dimension (w_dim.x)
+ // varies most rapidly (i.e., it is the innermost dimension).
+ //
+ // The formula for the linearized thread index is:
+ // L = w_id.x + w_dim.x * (w_id.y + (w_dim.y * w_id.z))
+ //
+ // Subsequently, the range of linearized indices [0, N_threads-1] is
+ // divided into consecutive, non-overlapping segments, each representing
+ // a subgroup of size 'subgroup_size'.
+ //
+ // Example Partitioning (N = subgroup_size):
+ // | Subgroup 0 | Subgroup 1 | Subgroup 2 | ... |
+ // | Indices 0..N-1 | Indices N..2N-1 | Indices 2N..3N-1| ... |
+ //
+ // The subgroup identifier is obtained via integer division of the
+ // linearized thread index by the predefined 'subgroup_size'.
+ //
+ // subgroup_id = floor( L / subgroup_size )
+ // = (w_id.x + w_dim.x * (w_id.y + w_dim.y * w_id.z)) /
+ // subgroup_size
+ auto int32Type = IntegerType::get(rewriter.getContext(), 32);
+ Location loc = op.getLoc();
+ LLVM::IntegerOverflowFlags flags =
+ LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
+ Value workitemIdX = rewriter.create<ROCDL::ThreadIdXOp>(loc, int32Type);
+ Value workitemIdY = rewriter.create<ROCDL::ThreadIdYOp>(loc, int32Type);
+ Value workitemIdZ = rewriter.create<ROCDL::ThreadIdZOp>(loc, int32Type);
+ Value workitemDimX = rewriter.create<ROCDL::BlockDimXOp>(loc, int32Type);
+ Value workitemDimY = rewriter.create<ROCDL::BlockDimYOp>(loc, int32Type);
+ Value dimYxIdZ = rewriter.create<LLVM::MulOp>(loc, int32Type, workitemDimY,
+ workitemIdZ, flags);
+ Value dimYxIdZPlusIdY = rewriter.create<LLVM::AddOp>(
+ loc, int32Type, dimYxIdZ, workitemIdY, flags);
+ Value dimYxIdZPlusIdYTimesDimX = rewriter.create<LLVM::MulOp>(
+ loc, int32Type, workitemDimX, dimYxIdZPlusIdY, flags);
+ Value workitemIdXPlusDimYxIdZPlusIdYTimesDimX =
+ rewriter.create<LLVM::AddOp>(loc, int32Type, workitemIdX,
+ dimYxIdZPlusIdYTimesDimX, flags);
+ Value subgroupSize = rewriter.create<ROCDL::WavefrontSizeOp>(
+ loc, rewriter.getI32Type(), nullptr);
+ Value waveIdOp = rewriter.create<LLVM::UDivOp>(
+ loc, workitemIdXPlusDimYxIdZPlusIdYTimesDimX, subgroupSize);
+ rewriter.replaceOp(op, {truncOrExtToLLVMType(rewriter, loc, waveIdOp,
+ *getTypeConverter())});
+ return success();
+ }
+};
+
/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"
@@ -249,19 +300,7 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
// code.
struct LowerGpuOpsToROCDLOpsPass final
: public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
- LowerGpuOpsToROCDLOpsPass() = default;
- LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
- bool useBarePtrCallConv,
- gpu::amd::Runtime runtime) {
- if (this->chipset.getNumOccurrences() == 0)
- this->chipset = chipset;
- if (this->indexBitwidth.getNumOccurrences() == 0)
- this->indexBitwidth = indexBitwidth;
- if (this->useBarePtrCallConv.getNumOccurrences() == 0)
- this->useBarePtrCallConv = useBarePtrCallConv;
- if (this->runtime.getNumOccurrences() == 0)
- this->runtime = runtime;
- }
+ using Base::Base;
void getDependentDialects(DialectRegistry &registry) const override {
Base::getDependentDialects(registry);
@@ -456,7 +495,7 @@ void mlir::populateGpuToROCDLConversionPatterns(
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
patterns
- .add<GPUShuffleOpLowering, GPULaneIdOpToROCDL, GPUSubgroupSizeOpToROCDL>(
+ .add<GPUShuffleOpLowering, GPULaneIdOpToROCDL, GPUSubgroupIdOpToROCDL>(
converter);
patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
@@ -464,10 +503,6 @@ void mlir::populateGpuToROCDLConversionPatterns(
}
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
- unsigned indexBitwidth,
- bool useBarePtrCallConv,
- gpu::amd::Runtime runtime) {
- return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
- chipset, indexBitwidth, useBarePtrCallConv, runtime);
+mlir::createLowerGpuOpsToROCDLOpsPass() {
+ return std::make_unique<LowerGpuOpsToROCDLOpsPass>();
}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index d28aa9e34c22a..e8868aeda4dcb 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -763,3 +763,25 @@ gpu.module @test_module {
gpu.module @test_custom_data_layout attributes {llvm.data_layout = "e"} {
}
+
+// -----
+
+gpu.module @test_module {
+ // CHECK-LABEL: func @gpu_subgroup_id()
+ func.func @gpu_subgroup_id() -> (index) {
+ // CHECK: %[[widx:.*]] = rocdl.workitem.id.x : i32
+ // CHECK: %[[widy:.*]] = rocdl.workitem.id.y : i32
+ // CHECK: %[[widz:.*]] = rocdl.workitem.id.z : i32
+ // CHECK: %[[dimx:.*]] = rocdl.workgroup.dim.x : i32
+ // CHECK: %[[dimy:.*]] = rocdl.workgroup.dim.y : i32
+ // CHECK: %[[int5:.*]] = llvm.mul %[[dimy]], %[[widz]] overflow<nsw, nuw> : i32
+ // CHECK: %[[int6:.*]] = llvm.add %[[int5]], %[[widy]] overflow<nsw, nuw> : i32
+ // CHECK: %[[int7:.*]] = llvm.mul %[[dimx]], %[[int6]] overflow<nsw, nuw> : i32
+ // CHECK: %[[int8:.*]] = llvm.add %[[widx]], %[[int7]] overflow<nsw, nuw> : i32
+ // CHECK: %[[wavefrontsize:.*]] = rocdl.wavefrontsize : i32
+ // CHECK: %[[result:.*]] = llvm.udiv %[[int8]], %[[wavefrontsize]] : i32
+ // CHECK: = llvm.sext %[[result]] : i32 to i64
+ %subgroupId = gpu.subgroup_id : index
+ func.return %subgroupId : index
+ }
+}
More information about the Mlir-commits
mailing list