[Mlir-commits] [mlir] [MLIR][ROCDL] Add conversion for gpu.subgroup_id to ROCDL (PR #136405)
Jakub Kuderski
llvmlistbot at llvm.org
Fri Apr 25 19:55:20 PDT 2025
================
@@ -239,6 +231,65 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
}
};
+struct GPUSubgroupIdOpToROCDL final
+ : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+ LogicalResult
+ matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ // Calculation of the thread's subgroup identifier.
+ //
+ // The process involves mapping the thread's 3D identifier within its
+ // workgroup/block (w_id.x, w_id.y, w_id.z) to a 1D linear index.
+ // This linearization assumes a layout where the x-dimension (w_dim.x)
+ // varies most rapidly (i.e., it is the innermost dimension).
+ //
+ // The formula for the linearized thread index is:
+ // L = w_id.x + w_dim.x * (w_id.y + (w_dim.y * w_id.z))
+ //
+ // Subsequently, the range of linearized indices [0, N_threads-1] is
+ // divided into consecutive, non-overlapping segments, each representing
+ // a subgroup of size 'subgroup_size'.
+ //
+ // Example Partitioning (N = subgroup_size):
+ // | Subgroup 0 | Subgroup 1 | Subgroup 2 | ... |
+ // | Indices 0..N-1 | Indices N..2N-1 | Indices 2N..3N-1| ... |
+ //
+ // The subgroup identifier is obtained via integer division of the
+ // linearized thread index by the predefined 'subgroup_size'.
+ //
+ // subgroup_id = floor( L / subgroup_size )
+ // = (w_id.x + w_dim.x * (w_id.y + w_dim.y * w_id.z)) /
+ // subgroup_size
+ auto int32Type = IntegerType::get(rewriter.getContext(), 32);
+ Location loc = op.getLoc();
+ LLVM::IntegerOverflowFlags flags =
+ LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
+ Value workitemIdX = rewriter.create<ROCDL::ThreadIdXOp>(loc, int32Type);
+ Value workitemIdY = rewriter.create<ROCDL::ThreadIdYOp>(loc, int32Type);
+ Value workitemIdZ = rewriter.create<ROCDL::ThreadIdZOp>(loc, int32Type);
+ Value workitemDimX = rewriter.create<ROCDL::BlockDimXOp>(loc, int32Type);
+ Value workitemDimY = rewriter.create<ROCDL::BlockDimYOp>(loc, int32Type);
+ Value dimYxIdZ = rewriter.create<LLVM::MulOp>(loc, int32Type, workitemDimY,
+ workitemIdZ, flags);
+ Value dimYxIdZPlusIdY = rewriter.create<LLVM::AddOp>(
+ loc, int32Type, dimYxIdZ, workitemIdY, flags);
+ Value dimYxIdZPlusIdYTimesDimX = rewriter.create<LLVM::MulOp>(
+ loc, int32Type, workitemDimX, dimYxIdZPlusIdY, flags);
+ Value workitemIdXPlusDimYxIdZPlusIdYTimesDimX =
+ rewriter.create<LLVM::AddOp>(loc, int32Type, workitemIdX,
+ dimYxIdZPlusIdYTimesDimX, flags);
+ Value subgroupSize = rewriter.create<ROCDL::WavefrontSizeOp>(
+ loc, rewriter.getI32Type(), nullptr);
----------------
kuhar wrote:
nit: could you add an inline comment naming the function parameter next to `nullptr` (e.g., `/*paramName=*/nullptr`)? As written, it's hard to tell which argument this sets.
https://github.com/llvm/llvm-project/pull/136405
More information about the Mlir-commits mailing list