[Mlir-commits] [mlir] [mlir][gpu] Add `gpu.subgroup_uniform` op (PR #157743)

Wed Sep 10 08:04:43 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-mlir

Author: Ivan Butygin (Hardcode84)

<details>
<summary>Changes</summary>

Introducing a dedicated op instead of broadcast `any_lane` per discussion https://github.com/llvm/llvm-project/pull/152808

---
Full diff: https://github.com/llvm/llvm-project/pull/157743.diff


5 Files Affected:

- (modified) mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (+33) 
- (modified) mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp (+18-1) 
- (modified) mlir/lib/Dialect/GPU/IR/GPUDialect.cpp (+9) 
- (modified) mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir (+12) 
- (modified) mlir/test/Dialect/GPU/ops.mlir (+8) 


``````````diff

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 987fc13e0508d..3e2fa6b43d5fd 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -3255,4 +3255,37 @@ def GPU_SubgroupBroadcastOp : GPU_Op<"subgroup_broadcast",
   let hasVerifier = 1;
 }
 
+def GPU_SubgroupUniformOp : GPU_Op<"subgroup_uniform",
+    [Pure, AllTypesMatch<["result", "src"]>,
+    DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>] #
+    ElementwiseMappable.traits>,
+  Arguments<(ins AnyType:$src)> {
+  let summary = "Assumes value is unform across the lanes in subgroup";
+  let description = [{
+    The "subgroup_uniform" op assumes that the value is uniform across all lanes
+    in a subgroup. This means that all active lanes in the subgroup are expected
+    to have the same value.
+
+    This op can be used to inform the compiler that a value is uniform across
+    the subgroup, enabling optimizations. The result is poison if the value
+    is not actually uniform.
+
+    This op is functionally no-op as no valid program should change its
+    semantics if this op is removed. Backends can choose to ignore it or do
+    some optimizations (e.g. put value into scalar registers).
+
+    This op can be freely speculated across structured control flow as parent
+    active mask is always superset of current mask and if can hoist input
+    calculation you can hoist the operation itself as well.
+
+    Example:
+
+    ```mlir
+    %1 = gpu.subgroup_uniform %0 : f32
+    ```
+  }];
+  let results = (outs AnyType:$result);
+  let assemblyFormat = "$src attr-dict `:` type($result)";
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 807d1f52ee69b..5377ce709497e 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -201,6 +201,22 @@ struct GPUSubgroupBroadcastOpToROCDL
   }
 };
 
+struct GPUSubgroupUniformOpToROCDL
+    : public ConvertOpToLLVMPattern<gpu::SubgroupUniformOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(gpu::SubgroupUniformOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Value src = adaptor.getSrc();
+    if (!isSupportedReadLaneType(src.getType()))
+      return rewriter.notifyMatchFailure(op, "unsupported readlane type");
+
+    rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(), src);
+    return success();
+  }
+};
+
 struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
   using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
 
@@ -494,7 +510,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
   patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
 
   patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
-               GPUSubgroupBroadcastOpToROCDL>(converter);
+               GPUSubgroupBroadcastOpToROCDL, GPUSubgroupUniformOpToROCDL>(
+      converter);
   patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
 
   populateMathToROCDLConversionPatterns(converter, patterns);
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 43b02f16aa829..6022fa517421a 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -2550,6 +2550,15 @@ LogicalResult gpu::SubgroupBroadcastOp::verify() {
   }
 }
 
+//===----------------------------------------------------------------------===//
+// GPU_SubgroupUniformOp
+//===----------------------------------------------------------------------===//
+
+void gpu::SubgroupUniformOp::inferResultRanges(
+    ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
+  setResultRange(getResult(), argRanges.front());
+}
+
 //===----------------------------------------------------------------------===//
 // GPU KernelMetadataAttr
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index c6261b37ef8f2..69fb45d9097b8 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -816,3 +816,15 @@ func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index) {
   func.return %0, %1 : index, index
 }
 }
+
+// -----
+
+gpu.module @test_module {
+// CHECK-LABEL: func @unifprm
+//  CHECK-SAME:   (%[[ARG:.*]]: i64)
+func.func @unifprm(%arg0 : index) -> index {
+//       CHECK:   %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
+  %0 = gpu.subgroup_uniform %arg0 : index
+  func.return %0 : index
+}
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index e3e2474d917c8..d45d1cf52d91d 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -552,3 +552,11 @@ func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32) {
   %1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
   func.return %0, %1 : f32, f32
 }
+
+// CHECK-LABEL: func @subgroup_uniform
+//  CHECK-SAME: (%[[ARG:.*]]: f32)
+func.func @subgroup_uniform(%arg0 : f32) -> f32 {
+  // CHECK: gpu.subgroup_uniform %[[ARG]] : f32
+  %0 = gpu.subgroup_uniform %arg0 : f32
+  func.return %0 : f32
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/157743