[Mlir-commits] [mlir] [mlir][gpu] Add `broadcast_lane` op (PR #152808)

Ivan Butygin llvmlistbot at llvm.org
Fri Aug 8 15:28:45 PDT 2025


https://github.com/Hardcode84 created https://github.com/llvm/llvm-project/pull/152808

`broadcast_lane` allows broadcasting a value from one lane to all lanes in the subgroup. A usage sketch follows the list of modes below.

Supported modes:
* `first_lane` - broadcast the value from the first active lane in the subgroup.
* `lane` - broadcast the value from the specified lane; the lane index must be within the subgroup.
* `any_lane` - if the `src` value is uniform across all subgroup lanes, return it unchanged; otherwise the result is poison. This variant is essentially a uniformity hint for the compiler, conveying that a specific value is uniform across all subgroup lanes. Dropping an `any_lane` broadcast does not change the code semantics.
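
For illustration, a minimal usage sketch of the three modes (mirroring the `ops.mlir` test added in this patch; `@broadcast_example`, `%src`, and `%idx` are placeholder names):

```mlir
func.func @broadcast_example(%src : f32, %idx : i32) -> (f32, f32, f32) {
  // Broadcast from the first active lane in the subgroup.
  %0 = gpu.broadcast_lane %src, first_lane : f32
  // Uniformity hint: %src is expected to be uniform across the subgroup.
  %1 = gpu.broadcast_lane %src, any_lane : f32
  // Broadcast from the lane selected by %idx; %idx must be within the subgroup.
  %2 = gpu.broadcast_lane %src, lane %idx : f32
  func.return %0, %1, %2 : f32, f32, f32
}
```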

From 6464cb81381e1f66b653675f42badf74073a041f Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Sat, 9 Aug 2025 00:16:02 +0200
Subject: [PATCH] [mlir][gpu] Add `broadcast_lane` op

`broadcast_lane` allows broadcasting a value from one lane to all lanes in the subgroup.

Supported modes:
* `first_lane` - broadcast the value from the first active lane in the subgroup.
* `lane` - broadcast the value from the specified lane; the lane index must be within the subgroup.
* `any_lane` - if the `src` value is uniform across all subgroup
lanes, return it unchanged; otherwise the result is poison. This variant
is essentially a uniformity hint for the compiler, conveying that a
specific value is uniform across all subgroup lanes. Dropping an
`any_lane` broadcast does not change the code semantics.
---
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    | 44 ++++++++++++++++++-
 .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp      | 25 ++++++++++-
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp        | 37 ++++++++++++++++
 .../Conversion/GPUToROCDL/gpu-to-rocdl.mlir   | 18 +++++++-
 .../GPU/broadcast-speculatability.mlir        | 23 ++++++++++
 .../test/Dialect/GPU/int-range-interface.mlir | 19 ++++++++
 mlir/test/Dialect/GPU/ops.mlir                | 16 ++++++-
 7 files changed, 177 insertions(+), 5 deletions(-)
 create mode 100644 mlir/test/Dialect/GPU/broadcast-speculatability.mlir

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index f946bb731e2ca..6592a5c55b0c2 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1517,7 +1517,7 @@ def GPU_GPUModuleOp : GPU_Op<"module", [
     /// Sets the targets of the module.
     void setTargets(ArrayRef<TargetAttrInterface> targets);
   }];
-  
+
   let hasVerifier = 1;
 }
 
@@ -3212,4 +3212,46 @@ def GPU_WarpExecuteOnLane0Op : GPU_Op<"warp_execute_on_lane_0",
   }];
 }
 
+def GPU_BroadcastType : I32EnumAttr<"BroadcastType",
+    "a lane to broadcast from",
+    [
+      I32EnumAttrCase<"first_lane", 0>,
+      I32EnumAttrCase<"any_lane", 1>,
+      I32EnumAttrCase<"lane", 2>
+    ]>{
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::gpu";
+}
+def GPU_BroadcastTypeAttr : EnumAttr<GPU_Dialect, GPU_BroadcastType, "broadcast">;
+
+def GPU_BroadcastLaneOp : GPU_Op<"broadcast_lane",
+    [NoMemoryEffect, AllTypesMatch<["result", "src"]>,
+    DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
+    DeclareOpInterfaceMethods<ConditionallySpeculatable, ["getSpeculatability"]>] #
+    ElementwiseMappable.traits>,
+  Arguments<(ins AnyType:$src,
+                 Optional<I32>:$lane,
+                 GPU_BroadcastTypeAttr:$broadcast_type)> {
+  let summary = "Broadcasts a value from a specific lane across the subgroup";
+  let description = [{
+      Broadcasts the value from one lane to all lanes in the subgroup.
+
+      The possible broadcast types are:
+
+      * `first_lane` - broadcast from the first active lane in the subgroup.
+      * `lane` - broadcast from the specified lane; the lane index must be
+      within the subgroup.
+      * `any_lane` - if the `src` value is uniform across all subgroup
+      lanes, return it unchanged; otherwise the result is poison. This variant
+      is essentially a uniformity hint for the compiler, conveying that a
+      specific value is uniform across all subgroup lanes. Dropping an
+      `any_lane` broadcast does not change the code semantics.
+  }];
+  let results = (outs AnyType:$result);
+  let assemblyFormat = [{
+    $src `,` $broadcast_type ($lane^)?  attr-dict `:` type($result)
+  }];
+  let hasVerifier = 1;
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index d22364e1ef441..4d081cefb5f35 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -160,6 +160,27 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
   const amdgpu::Chipset chipset;
 };
 
+struct GPUBroadcastLaneOpToROCDL
+    : public ConvertOpToLLVMPattern<gpu::BroadcastLaneOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(gpu::BroadcastLaneOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Value src = adaptor.getSrc();
+    if (adaptor.getBroadcastType() == gpu::BroadcastType::lane) {
+      rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src,
+                                                     adaptor.getLane());
+    } else { // first_lane or any_lane
+      // any_lane is lowered to readfirstlane too, to force the value into a
+      // scalar register.
+      rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(),
+                                                          src);
+    }
+    return success();
+  }
+};
+
 struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
   using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
 
@@ -453,7 +474,9 @@ void mlir::populateGpuToROCDLConversionPatterns(
   // TODO: Add alignment for workgroup memory
   patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
 
-  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
+  patterns
+      .add<GPUShuffleOpLowering, GPULaneIdOpToROCDL, GPUBroadcastLaneOpToROCDL>(
+          converter);
   patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
 
   populateMathToROCDLConversionPatterns(converter, patterns);
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 2503ccb6a2cfe..2b1adc90477af 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -2511,6 +2511,43 @@ bool WarpExecuteOnLane0Op::areTypesCompatible(Type lhs, Type rhs) {
       verifyDistributedType(lhs, rhs, getWarpSize(), getOperation()));
 }
 
+//===----------------------------------------------------------------------===//
+// GPU_BroadcastLaneOp
+//===----------------------------------------------------------------------===//
+
+void gpu::BroadcastLaneOp::inferResultRanges(
+    ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
+  setResultRange(getResult(), argRanges.front());
+}
+
+Speculation::Speculatability gpu::BroadcastLaneOp::getSpeculatability() {
+  switch (getBroadcastType()) {
+  case BroadcastType::first_lane:
+    // Cannot speculate first_lane broadcast, because speculating it across
+    // control flow can change the active lanes.
+    return Speculation::NotSpeculatable;
+  case BroadcastType::any_lane:
+    LLVM_FALLTHROUGH;
+  case BroadcastType::lane:
+    return Speculation::Speculatable;
+  }
+  llvm_unreachable("Unknown broadcast type");
+}
+
+LogicalResult gpu::BroadcastLaneOp::verify() {
+  switch (getBroadcastType()) {
+  case BroadcastType::first_lane:
+    LLVM_FALLTHROUGH;
+  case BroadcastType::any_lane:
+    if (getLane())
+      return emitOpError() << "lane can only be specified for lane broadcast";
+    return success();
+  case BroadcastType::lane:
+    if (!getLane())
+      return emitOpError() << "lane must be specified for lane broadcast";
+    return success();
+  }
+  llvm_unreachable("Unknown broadcast type");
+}
+
 //===----------------------------------------------------------------------===//
 // GPU KernelMetadataAttr
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 2b6adffc81f72..ed62e1c689ae3 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -701,7 +701,7 @@ gpu.module @test_module {
     // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
     // CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32
     // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
-    %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32 
+    %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32
     // *** UP mode shuffle ***
     // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
     // CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32
@@ -776,3 +776,19 @@ gpu.module @test_module {
     func.return %bDimX : index
   }
 }
+
+// -----
+
+gpu.module @test_module {
+// CHECK-LABEL: func @broadcast
+//  CHECK-SAME:   (%[[ARG:.*]]: i64, %[[IDX:.*]]: i32)
+func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index, index) {
+//       CHECK:   %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
+//       CHECK:   %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
+//       CHECK:   %{{.*}} = rocdl.readlane %[[ARG]], %[[IDX]] : (i64, i32) -> i64
+  %0 = gpu.broadcast_lane %arg0, first_lane : index
+  %1 = gpu.broadcast_lane %arg0, any_lane : index
+  %2 = gpu.broadcast_lane %arg0, lane %arg1 : index
+  func.return %0, %1, %2 : index, index, index
+}
+}
diff --git a/mlir/test/Dialect/GPU/broadcast-speculatability.mlir b/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
new file mode 100644
index 0000000000000..facbe8761c1fd
--- /dev/null
+++ b/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
@@ -0,0 +1,23 @@
+// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s
+
+func.func private @side_effect(%arg0 : f32, %arg1 : f32, %arg2 : f32)
+
+// CHECK-LABEL: func @broadcast_hoisting
+//  CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32)
+func.func @broadcast_hoisting(%arg0 : f32, %arg1 : i32) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+// CHECK: %[[V1:.*]] = gpu.broadcast_lane %[[ARG]], any_lane : f32
+// CHECK: %[[V2:.*]] = gpu.broadcast_lane %[[ARG]], lane %[[IDX]] : f32
+// CHECK: scf.for
+// CHECK: %[[V0:.*]] = gpu.broadcast_lane %[[ARG]], first_lane : f32
+// CHECK: func.call @side_effect(%[[V0]], %[[V1]], %[[V2]])
+  scf.for %i = %c0 to %c10 step %c1 {
+    %0 = gpu.broadcast_lane %arg0, first_lane : f32
+    %1 = gpu.broadcast_lane %arg0, any_lane : f32
+    %2 = gpu.broadcast_lane %arg0, lane %arg1 : f32
+    func.call @side_effect(%0, %1, %2) : (f32, f32, f32) -> ()
+  }
+  func.return
+}
diff --git a/mlir/test/Dialect/GPU/int-range-interface.mlir b/mlir/test/Dialect/GPU/int-range-interface.mlir
index 1613f83b17bde..cfb99283652a2 100644
--- a/mlir/test/Dialect/GPU/int-range-interface.mlir
+++ b/mlir/test/Dialect/GPU/int-range-interface.mlir
@@ -329,3 +329,22 @@ module attributes {gpu.container_module} {
     }
   }
 }
+
+// -----
+
+// CHECK-LABEL: func @broadcast
+func.func @broadcast(%idx: i32) {
+  %0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
+  %1 = gpu.broadcast_lane %0, first_lane : index
+  %2 = gpu.broadcast_lane %0, any_lane : index
+  %3 = gpu.broadcast_lane %0, lane %idx : index
+
+  // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
+  // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
+  // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
+
+  %4 = test.reflect_bounds %1 : index
+  %5 = test.reflect_bounds %2 : index
+  %6 = test.reflect_bounds %3 : index
+  return
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 9cc0bf8f41d5a..95b6d21097a37 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -126,7 +126,7 @@ module attributes {gpu.container_module} {
       // CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
       // CHECK-NEXT: gpu.yield %{{.*}} : f32
       // CHECK-NEXT: } : (f32) -> f32
-      %sum2 = gpu.all_reduce %one { 
+      %sum2 = gpu.all_reduce %one {
       ^bb(%lhs : f32, %rhs : f32):
         %tmp = arith.addf %lhs, %rhs : f32
         gpu.yield %tmp : f32
@@ -259,7 +259,7 @@ module attributes {gpu.container_module} {
       %1 = arith.cmpi slt, %arg0, %arg0 : i32
       scf.if %1 {
         gpu.printf ", "
-      } 
+      }
       gpu.return
     }
 
@@ -542,3 +542,15 @@ func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4
   }
   return %2 : vector<4xi32>
 }
+
+// CHECK-LABEL: func @broadcast_lane
+//  CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32)
+func.func @broadcast_lane(%arg0 : f32, %arg1 : i32) -> (f32, f32, f32) {
+  // CHECK: gpu.broadcast_lane %[[ARG]], first_lane : f32
+  %0 = gpu.broadcast_lane %arg0, first_lane : f32
+  // CHECK: gpu.broadcast_lane %[[ARG]], any_lane : f32
+  %1 = gpu.broadcast_lane %arg0, any_lane : f32
+  // CHECK: gpu.broadcast_lane %[[ARG]], lane %[[IDX]] : f32
+  %2 = gpu.broadcast_lane %arg0, lane %arg1 : f32
+  func.return %0, %1, %2 : f32, f32, f32
+}


