[Mlir-commits] [mlir] [MLIR][ROCDL] Add conversion for gpu.subgroup_id to ROCDL (PR #136405)

Fri Apr 25 19:55:20 PDT 2025

================
@@ -239,6 +231,65 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
   }
 };
 
+struct GPUSubgroupIdOpToROCDL final
+    : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Calculation of the thread's subgroup identifier.
+    //
+    // The process involves mapping the thread's 3D identifier within its
+    // workgroup/block (w_id.x, w_id.y, w_id.z) to a 1D linear index.
+    // This linearization assumes a layout where the x-dimension (w_dim.x)
+    // varies most rapidly (i.e., it is the innermost dimension).
+    //
+    // The formula for the linearized thread index is:
+    // L = w_id.x + w_dim.x * (w_id.y + (w_dim.y * w_id.z))
+    //
+    // Subsequently, the range of linearized indices [0, N_threads-1] is
+    // divided into consecutive, non-overlapping segments, each representing
+    // a subgroup of size 'subgroup_size'.
+    //
+    // Example Partitioning (N = subgroup_size):
+    // | Subgroup 0      | Subgroup 1      | Subgroup 2      | ... |
+    // | Indices 0..N-1  | Indices N..2N-1 | Indices 2N..3N-1| ... |
+    //
+    // The subgroup identifier is obtained via integer division of the
+    // linearized thread index by the predefined 'subgroup_size'.
+    //
+    // subgroup_id = floor( L / subgroup_size )
+    //             = (w_id.x + w_dim.x * (w_id.y + w_dim.y * w_id.z)) /
+    //             subgroup_size
+    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
+    Location loc = op.getLoc();
+    LLVM::IntegerOverflowFlags flags =
+        LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
+    Value workitemIdX = rewriter.create<ROCDL::ThreadIdXOp>(loc, int32Type);
+    Value workitemIdY = rewriter.create<ROCDL::ThreadIdYOp>(loc, int32Type);
+    Value workitemIdZ = rewriter.create<ROCDL::ThreadIdZOp>(loc, int32Type);
+    Value workitemDimX = rewriter.create<ROCDL::BlockDimXOp>(loc, int32Type);
+    Value workitemDimY = rewriter.create<ROCDL::BlockDimYOp>(loc, int32Type);
+    Value dimYxIdZ = rewriter.create<LLVM::MulOp>(loc, int32Type, workitemDimY,
+                                                  workitemIdZ, flags);
+    Value dimYxIdZPlusIdY = rewriter.create<LLVM::AddOp>(
+        loc, int32Type, dimYxIdZ, workitemIdY, flags);
+    Value dimYxIdZPlusIdYTimesDimX = rewriter.create<LLVM::MulOp>(
+        loc, int32Type, workitemDimX, dimYxIdZPlusIdY, flags);
+    Value workitemIdXPlusDimYxIdZPlusIdYTimesDimX =
+        rewriter.create<LLVM::AddOp>(loc, int32Type, workitemIdX,
+                                     dimYxIdZPlusIdYTimesDimX, flags);
+    Value subgroupSize = rewriter.create<ROCDL::WavefrontSizeOp>(
+        loc, rewriter.getI32Type(), nullptr);
----------------
kuhar wrote:

nit: add an inline comment with the name of the function parameter behind `nullptr`? It's hard to say what this is setting.

https://github.com/llvm/llvm-project/pull/136405