[Mlir-commits] [mlir] [mlir][gpu] Add `subgroup_broadcast` op (PR #152808)
Ivan Butygin
llvmlistbot at llvm.org
Fri Aug 22 23:19:31 PDT 2025
https://github.com/Hardcode84 updated https://github.com/llvm/llvm-project/pull/152808
>From b9c96eb744e4de01e6cc1f0c35155b8de2e5967c Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Sat, 9 Aug 2025 00:16:02 +0200
Subject: [PATCH 1/3] [mlir][gpu] Add `broadcast_lane` op
`broadcast_lane` allows broadcasting a value from one lane to all lanes in a subgroup.
Supported modes:
* `first_lane` - broadcast value from the first active lane in subgroup.
* `lane` - broadcast the value from the specified lane; the lane index must be within the subgroup.
* `any_lane` - if the `src` value is uniform across all the subgroup
lanes, return it unchanged; otherwise the result is poison. This variant
is essentially a uniformity hint for the compiler, conveying that a
specific value is uniform across all subgroup lanes. Dropping an `any_lane`
broadcast will not change the code semantics.
---
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 44 ++++++++++++++++++-
.../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 25 ++++++++++-
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 37 ++++++++++++++++
.../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 16 +++++++
.../GPU/broadcast-speculatability.mlir | 23 ++++++++++
.../test/Dialect/GPU/int-range-interface.mlir | 19 ++++++++
mlir/test/Dialect/GPU/ops.mlir | 16 ++++++-
7 files changed, 176 insertions(+), 4 deletions(-)
create mode 100644 mlir/test/Dialect/GPU/broadcast-speculatability.mlir
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index a5c3a92f1b7a5..f4e1f8fa21d09 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1517,7 +1517,7 @@ def GPU_GPUModuleOp : GPU_Op<"module", [
/// Sets the targets of the module.
void setTargets(ArrayRef<TargetAttrInterface> targets);
}];
-
+
let hasVerifier = 1;
}
@@ -3215,4 +3215,46 @@ def GPU_WarpExecuteOnLane0Op : GPU_Op<"warp_execute_on_lane_0",
}];
}
+def GPU_BroadcastType : I32EnumAttr<"BroadcastType",
+ "a lane to broadcast from",
+ [
+ I32EnumAttrCase<"first_lane", 0>,
+ I32EnumAttrCase<"any_lane", 1>,
+ I32EnumAttrCase<"lane", 2>
+ ]>{
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::gpu";
+}
+def GPU_BroadcastTypeAttr : EnumAttr<GPU_Dialect, GPU_BroadcastType, "broadcast">;
+
+def GPU_BroadcastLaneOp : GPU_Op<"broadcast_lane",
+ [NoMemoryEffect, AllTypesMatch<["result", "src"]>,
+ DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
+ DeclareOpInterfaceMethods<ConditionallySpeculatable, ["getSpeculatability"]>] #
+ ElementwiseMappable.traits>,
+ Arguments<(ins AnyType:$src,
+ Optional<I32>:$lane,
+ GPU_BroadcastTypeAttr:$broadcast_type)> {
+ let summary = "Broadcasts a value from the specific lane across subgroup";
+ let description = [{
+ Broadcasts the value from the one lane to the all lanes in subgroup.
+
+ The possible broadcats types are:
+
+ * `first_lane` - first active lane in subgroup.
+ * `lane` - from the specified lane, lane index must be withing subgroup.
+ * `any_lane` - if `src` value is uniform across all the subgroup
+ lanes return it unchanged, otherwise result is poison. This variant
+ essentially an uniformity hint for the compiler, conveying that
+ specific value is uniform across all subgroup lanes. Dropping `any_lane`
+ broadcast will not change the code semantics.
+ ```
+ }];
+ let results = (outs AnyType:$result);
+ let assemblyFormat = [{
+ $src `,` $broadcast_type ($lane^)? attr-dict `:` type($result)
+ }];
+ let hasVerifier = 1;
+}
+
#endif // GPU_OPS
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index e6fbcf98950a4..83c34aa7a0d6a 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -171,6 +171,27 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
const amdgpu::Chipset chipset;
};
+struct GPUBroadcastLaneOpToROCDL
+ : public ConvertOpToLLVMPattern<gpu::BroadcastLaneOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+ LogicalResult
+ matchAndRewrite(gpu::BroadcastLaneOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Value src = adaptor.getSrc();
+ if (adaptor.getBroadcastType() == gpu::BroadcastType::lane) {
+ rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src,
+ adaptor.getLane());
+ } else { // first_lane or any_lane
+ // any_lane is lowered to readfirstlane too, to force value into scalar
+ // register.
+ rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(),
+ src);
+ }
+ return success();
+ }
+};
+
struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
@@ -463,7 +484,9 @@ void mlir::populateGpuToROCDLConversionPatterns(
// TODO: Add alignment for workgroup memory
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
- patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
+ patterns
+ .add<GPUShuffleOpLowering, GPULaneIdOpToROCDL, GPUBroadcastLaneOpToROCDL>(
+ converter);
patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
populateMathToROCDLConversionPatterns(converter, patterns);
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index cc77aa6711c42..e93686feac6a8 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -2514,6 +2514,43 @@ gpu::YieldOp WarpExecuteOnLane0Op::getTerminator() {
return cast<gpu::YieldOp>(getBody()->getTerminator());
}
+//===----------------------------------------------------------------------===//
+// GPU_BroadcastLaneOp
+//===----------------------------------------------------------------------===//
+
+void gpu::BroadcastLaneOp::inferResultRanges(
+ ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
+ setResultRange(getResult(), argRanges.front());
+}
+
+Speculation::Speculatability gpu::BroadcastLaneOp::getSpeculatability() {
+ switch (getBroadcastType()) {
+ case BroadcastType::first_lane:
+ // Cannot speculate first_lane broadcast, because speculating it across
+ // control flow can change the active lanes.
+ return Speculation::NotSpeculatable;
+ case BroadcastType::any_lane:
+ LLVM_FALLTHROUGH;
+ case BroadcastType::lane:
+ return Speculation::Speculatable;
+ }
+}
+
+LogicalResult gpu::BroadcastLaneOp::verify() {
+ switch (getBroadcastType()) {
+ case BroadcastType::first_lane:
+ LLVM_FALLTHROUGH;
+ case BroadcastType::any_lane:
+ if (getLane())
+ return emitOpError() << "lane can only be specified for lane broadcast";
+ return success();
+ case BroadcastType::lane:
+ if (!getLane())
+ return emitOpError() << "lane must be specified for lane broadcast";
+ return success();
+ }
+}
+
//===----------------------------------------------------------------------===//
// GPU KernelMetadataAttr
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index fa4a9749f6a9b..e9361a364e346 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -776,3 +776,19 @@ gpu.module @test_module {
func.return %bDimX : index
}
}
+
+// -----
+
+gpu.module @test_module {
+// CHECK-LABEL: func @broadcast
+// CHECK-SAME: (%[[ARG:.*]]: i64, %[[IDX:.*]]: i32)
+func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index, index) {
+// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
+// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
+// CHECK: %{{.*}} = rocdl.readlane %[[ARG]], %[[IDX]] : (i64, i32) -> i64
+ %0 = gpu.broadcast_lane %arg0, first_lane : index
+ %1 = gpu.broadcast_lane %arg0, any_lane : index
+ %2 = gpu.broadcast_lane %arg0, lane %arg1 : index
+ func.return %0, %1, %2 : index, index, index
+}
+}
diff --git a/mlir/test/Dialect/GPU/broadcast-speculatability.mlir b/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
new file mode 100644
index 0000000000000..facbe8761c1fd
--- /dev/null
+++ b/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
@@ -0,0 +1,23 @@
+// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s
+
+func.func private @side_effect(%arg0 : f32, %arg1 : f32, %arg2 : f32)
+
+// CHECK-LABEL: func @broadcast_hoisting
+// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32)
+func.func @broadcast_hoisting(%arg0 : f32, %arg1 : i32) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c10 = arith.constant 10 : index
+// CHECK: %[[V1:.*]] = gpu.broadcast_lane %[[ARG]], any_lane : f32
+// CHECK: %[[V2:.*]] = gpu.broadcast_lane %[[ARG]], lane %[[IDX]] : f32
+// CHECK: scf.for
+// CHECK: %[[V0:.*]] = gpu.broadcast_lane %[[ARG]], first_lane : f32
+// CHECK: func.call @side_effect(%[[V0]], %[[V1]], %[[V2]])
+ scf.for %i = %c0 to %c10 step %c1 {
+ %0 = gpu.broadcast_lane %arg0, first_lane : f32
+ %1 = gpu.broadcast_lane %arg0, any_lane : f32
+ %2 = gpu.broadcast_lane %arg0, lane %arg1 : f32
+ func.call @side_effect(%0, %1, %2) : (f32, f32, f32) -> ()
+ }
+ func.return
+}
diff --git a/mlir/test/Dialect/GPU/int-range-interface.mlir b/mlir/test/Dialect/GPU/int-range-interface.mlir
index 1613f83b17bde..cfb99283652a2 100644
--- a/mlir/test/Dialect/GPU/int-range-interface.mlir
+++ b/mlir/test/Dialect/GPU/int-range-interface.mlir
@@ -329,3 +329,22 @@ module attributes {gpu.container_module} {
}
}
}
+
+// -----
+
+// CHECK-LABEL: func @broadcast
+func.func @broadcast(%idx: i32) {
+ %0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
+ %1 = gpu.broadcast_lane %0, first_lane : index
+ %2 = gpu.broadcast_lane %0, any_lane : index
+ %3 = gpu.broadcast_lane %0, lane %idx : index
+
+ // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
+ // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
+ // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
+
+ %4 = test.reflect_bounds %1 : index
+ %5 = test.reflect_bounds %2 : index
+ %6 = test.reflect_bounds %3 : index
+ return
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 9cc0bf8f41d5a..95b6d21097a37 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -126,7 +126,7 @@ module attributes {gpu.container_module} {
// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT: gpu.yield %{{.*}} : f32
// CHECK-NEXT: } : (f32) -> f32
- %sum2 = gpu.all_reduce %one {
+ %sum2 = gpu.all_reduce %one {
^bb(%lhs : f32, %rhs : f32):
%tmp = arith.addf %lhs, %rhs : f32
gpu.yield %tmp : f32
@@ -259,7 +259,7 @@ module attributes {gpu.container_module} {
%1 = arith.cmpi slt, %arg0, %arg0 : i32
scf.if %1 {
gpu.printf ", "
- }
+ }
gpu.return
}
@@ -542,3 +542,15 @@ func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4
}
return %2 : vector<4xi32>
}
+
+// CHECK-LABEL: func @broadcast_lane
+// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32)
+func.func @broadcast_lane(%arg0 : f32, %arg1 : i32) -> (f32, f32, f32) {
+ // CHECK: gpu.broadcast_lane %[[ARG]], first_lane : f32
+ %0 = gpu.broadcast_lane %arg0, first_lane : f32
+ // CHECK: gpu.broadcast_lane %[[ARG]], any_lane : f32
+ %1 = gpu.broadcast_lane %arg0, any_lane : f32
+ // CHECK: gpu.broadcast_lane %[[ARG]], lane %[[IDX]] : f32
+ %2 = gpu.broadcast_lane %arg0, lane %arg1 : f32
+ func.return %0, %1, %2 : f32, f32, f32
+}
>From c40f6fccb1c9dd04c495f8d33c1d662bc59719ff Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Sat, 9 Aug 2025 23:51:41 +0200
Subject: [PATCH 2/3] update doc
---
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 31 +++++++++++++---------
1 file changed, 18 insertions(+), 13 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index f4e1f8fa21d09..3dee26e9f4517 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -3237,22 +3237,27 @@ def GPU_BroadcastLaneOp : GPU_Op<"broadcast_lane",
GPU_BroadcastTypeAttr:$broadcast_type)> {
let summary = "Broadcasts a value from the specific lane across subgroup";
let description = [{
- Broadcasts the value from the one lane to the all lanes in subgroup.
-
- The possible broadcats types are:
-
- * `first_lane` - first active lane in subgroup.
- * `lane` - from the specified lane, lane index must be withing subgroup.
- * `any_lane` - if `src` value is uniform across all the subgroup
- lanes return it unchanged, otherwise result is poison. This variant
- essentially an uniformity hint for the compiler, conveying that
- specific value is uniform across all subgroup lanes. Dropping `any_lane`
- broadcast will not change the code semantics.
- ```
+ Broadcasts a value from one lane to all lanes in a subgroup. The
+ result is guaranteed to be uniform across the subgroup.
+
+ The possible broadcast types are:
+
+ * `first_lane` - broadcasts the value from the first active lane in the
+ subgroup.
+ * `lane` - broadcasts from the specified lane. The lane index must be
+ uniform and within the subgroup size. The result is poison if the lane
+ index is invalid or non-subgroup-uniform.
+ * `any_lane` - broadcasts the value from any lane of the subgroup,
+ active or inactive, assuming the input is already subgroup uniform. The
+ result is poison if the input is not uniform. This is useful to convey
+ uniformity to the compiler to enable more optimizations. Also, it allows
+ more speculation opportunities than `first_lane` since `first_lane`
+ results can depend on active lanes which may change during speculation
+ across control flow.
}];
let results = (outs AnyType:$result);
let assemblyFormat = [{
- $src `,` $broadcast_type ($lane^)? attr-dict `:` type($result)
+ $src `,` $broadcast_type ($lane^)? attr-dict `:` type($result)
}];
let hasVerifier = 1;
}
>From 0af0dcf0b94015a637aaa82f6368235d7cbd992c Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Wed, 13 Aug 2025 09:48:50 +0200
Subject: [PATCH 3/3] renamings
---
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 33 ++++++++++---------
.../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 15 ++++-----
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 22 +++++++------
.../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 6 ++--
.../GPU/broadcast-speculatability.mlir | 21 ++++++------
.../test/Dialect/GPU/int-range-interface.mlir | 6 ++--
mlir/test/Dialect/GPU/ops.mlir | 16 ++++-----
7 files changed, 61 insertions(+), 58 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 3dee26e9f4517..3fb0cfe7e2a54 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -3218,16 +3218,16 @@ def GPU_WarpExecuteOnLane0Op : GPU_Op<"warp_execute_on_lane_0",
def GPU_BroadcastType : I32EnumAttr<"BroadcastType",
"a lane to broadcast from",
[
- I32EnumAttrCase<"first_lane", 0>,
+ I32EnumAttrCase<"first_active_lane", 0>,
I32EnumAttrCase<"any_lane", 1>,
- I32EnumAttrCase<"lane", 2>
+ I32EnumAttrCase<"specific_lane", 2>
]>{
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::gpu";
}
def GPU_BroadcastTypeAttr : EnumAttr<GPU_Dialect, GPU_BroadcastType, "broadcast">;
-def GPU_BroadcastLaneOp : GPU_Op<"broadcast_lane",
+def GPU_SubgroupBroadcastOp : GPU_Op<"subgroup_broadcast",
[NoMemoryEffect, AllTypesMatch<["result", "src"]>,
DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
DeclareOpInterfaceMethods<ConditionallySpeculatable, ["getSpeculatability"]>] #
@@ -3237,23 +3237,24 @@ def GPU_BroadcastLaneOp : GPU_Op<"broadcast_lane",
GPU_BroadcastTypeAttr:$broadcast_type)> {
let summary = "Broadcasts a value from the specific lane across subgroup";
let description = [{
- Broadcasts a value from one lane to all lanes in a subgroup. The
- result is guaranteed to be uniform across the subgroup.
+ Broadcasts a value from one lane to all active lanes in a subgroup. The
+ result is guaranteed to be uniform across the active lanes in subgroup.
The possible broadcast types are:
- * `first_lane` - broadcasts the value from the first active lane in the
- subgroup.
- * `lane` - broadcasts from the specified lane. The lane index must be
- uniform and within the subgroup size. The result is poison if the lane
- index is invalid or non-subgroup-uniform.
+ * `first_active_lane` - broadcasts the value from the first active lane
+ in the subgroup.
+ * `specific_lane` - broadcasts from the specified lane. The lane index
+ must be uniform and within the subgroup size. The result is poison if the
+ lane index is invalid, non subgroup-uniform, or if the source lane is not
+ active.
* `any_lane` - broadcasts the value from any lane of the subgroup,
- active or inactive, assuming the input is already subgroup uniform. The
- result is poison if the input is not uniform. This is useful to convey
- uniformity to the compiler to enable more optimizations. Also, it allows
- more speculation opportunities than `first_lane` since `first_lane`
- results can depend on active lanes which may change during speculation
- across control flow.
+ assuming the input is already subgroup uniform. The result is poison if
+ the input is not uniform. This is useful to convey uniformity to the
+ compiler to enable more optimizations. Also, it allows more speculation
+ opportunities than `first_active_lane` since `first_active_lane` results
+ can depend on active lanes which may change during speculation across
+ control flow.
}];
let results = (outs AnyType:$result);
let assemblyFormat = [{
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 83c34aa7a0d6a..ae1e3fb3adcb9 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -171,18 +171,18 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
const amdgpu::Chipset chipset;
};
-struct GPUBroadcastLaneOpToROCDL
- : public ConvertOpToLLVMPattern<gpu::BroadcastLaneOp> {
+struct GPUSubgroupBroadcastOpToROCDL
+ : public ConvertOpToLLVMPattern<gpu::SubgroupBroadcastOp> {
using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
LogicalResult
- matchAndRewrite(gpu::BroadcastLaneOp op, OpAdaptor adaptor,
+ matchAndRewrite(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
Value src = adaptor.getSrc();
- if (adaptor.getBroadcastType() == gpu::BroadcastType::lane) {
+ if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src,
adaptor.getLane());
- } else { // first_lane or any_lane
+ } else { // first_active_lane or any_lane
// any_lane is lowered to readfirstlane too, to force value into scalar
// register.
rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(),
@@ -484,9 +484,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
// TODO: Add alignment for workgroup memory
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
- patterns
- .add<GPUShuffleOpLowering, GPULaneIdOpToROCDL, GPUBroadcastLaneOpToROCDL>(
- converter);
+ patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
+ GPUSubgroupBroadcastOpToROCDL>(converter);
patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
populateMathToROCDLConversionPatterns(converter, patterns);
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index e93686feac6a8..bc81c4e9653c9 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -2515,38 +2515,40 @@ gpu::YieldOp WarpExecuteOnLane0Op::getTerminator() {
}
//===----------------------------------------------------------------------===//
-// GPU_BroadcastLaneOp
+// GPU_SubgroupBroadcastOp
//===----------------------------------------------------------------------===//
-void gpu::BroadcastLaneOp::inferResultRanges(
+void gpu::SubgroupBroadcastOp::inferResultRanges(
ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
setResultRange(getResult(), argRanges.front());
}
-Speculation::Speculatability gpu::BroadcastLaneOp::getSpeculatability() {
+Speculation::Speculatability gpu::SubgroupBroadcastOp::getSpeculatability() {
switch (getBroadcastType()) {
- case BroadcastType::first_lane:
+ case BroadcastType::first_active_lane:
// Cannot speculate first_lane broadcast, because speculating it across
// control flow can change the active lanes.
return Speculation::NotSpeculatable;
case BroadcastType::any_lane:
LLVM_FALLTHROUGH;
- case BroadcastType::lane:
+ case BroadcastType::specific_lane:
return Speculation::Speculatable;
}
}
-LogicalResult gpu::BroadcastLaneOp::verify() {
+LogicalResult gpu::SubgroupBroadcastOp::verify() {
switch (getBroadcastType()) {
- case BroadcastType::first_lane:
+ case BroadcastType::first_active_lane:
LLVM_FALLTHROUGH;
case BroadcastType::any_lane:
if (getLane())
- return emitOpError() << "lane can only be specified for lane broadcast";
+ return emitOpError()
+ << "lane can only be specified for `specific_lane` broadcast";
return success();
- case BroadcastType::lane:
+ case BroadcastType::specific_lane:
if (!getLane())
- return emitOpError() << "lane must be specified for lane broadcast";
+ return emitOpError()
+ << "lane must be specified for `specific_lane` broadcast";
return success();
}
}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index e9361a364e346..3143526fd0da6 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -786,9 +786,9 @@ func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index, index) {
// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
// CHECK: %{{.*}} = rocdl.readlane %[[ARG]], %[[IDX]] : (i64, i32) -> i64
- %0 = gpu.broadcast_lane %arg0, first_lane : index
- %1 = gpu.broadcast_lane %arg0, any_lane : index
- %2 = gpu.broadcast_lane %arg0, lane %arg1 : index
+ %0 = gpu.subgroup_broadcast %arg0, first_active_lane : index
+ %1 = gpu.subgroup_broadcast %arg0, any_lane : index
+ %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : index
func.return %0, %1, %2 : index, index, index
}
}
diff --git a/mlir/test/Dialect/GPU/broadcast-speculatability.mlir b/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
index facbe8761c1fd..ea32d62756c35 100644
--- a/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
+++ b/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
@@ -3,20 +3,21 @@
func.func private @side_effect(%arg0 : f32, %arg1 : f32, %arg2 : f32)
// CHECK-LABEL: func @broadcast_hoisting
-// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32)
-func.func @broadcast_hoisting(%arg0 : f32, %arg1 : i32) {
+// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32, {{.*}}: index)
+func.func @broadcast_hoisting(%arg0 : f32, %arg1 : i32, %arg2 : index) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
- %c10 = arith.constant 10 : index
-// CHECK: %[[V1:.*]] = gpu.broadcast_lane %[[ARG]], any_lane : f32
-// CHECK: %[[V2:.*]] = gpu.broadcast_lane %[[ARG]], lane %[[IDX]] : f32
+// `any_lane` and `specific_lane` can be speculated across the control flow, but
+// `first_active_lane` cannot as active lanes can change.
+// CHECK: %[[V1:.*]] = gpu.subgroup_broadcast %[[ARG]], any_lane : f32
+// CHECK: %[[V2:.*]] = gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
// CHECK: scf.for
-// CHECK: %[[V0:.*]] = gpu.broadcast_lane %[[ARG]], first_lane : f32
+// CHECK: %[[V0:.*]] = gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
// CHECK: func.call @side_effect(%[[V0]], %[[V1]], %[[V2]])
- scf.for %i = %c0 to %c10 step %c1 {
- %0 = gpu.broadcast_lane %arg0, first_lane : f32
- %1 = gpu.broadcast_lane %arg0, any_lane : f32
- %2 = gpu.broadcast_lane %arg0, lane %arg1 : f32
+ scf.for %i = %c0 to %arg2 step %c1 {
+ %0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
+ %1 = gpu.subgroup_broadcast %arg0, any_lane : f32
+ %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
func.call @side_effect(%0, %1, %2) : (f32, f32, f32) -> ()
}
func.return
diff --git a/mlir/test/Dialect/GPU/int-range-interface.mlir b/mlir/test/Dialect/GPU/int-range-interface.mlir
index cfb99283652a2..2e92db0f342aa 100644
--- a/mlir/test/Dialect/GPU/int-range-interface.mlir
+++ b/mlir/test/Dialect/GPU/int-range-interface.mlir
@@ -335,9 +335,9 @@ module attributes {gpu.container_module} {
// CHECK-LABEL: func @broadcast
func.func @broadcast(%idx: i32) {
%0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
- %1 = gpu.broadcast_lane %0, first_lane : index
- %2 = gpu.broadcast_lane %0, any_lane : index
- %3 = gpu.broadcast_lane %0, lane %idx : index
+ %1 = gpu.subgroup_broadcast %0, first_active_lane : index
+ %2 = gpu.subgroup_broadcast %0, any_lane : index
+ %3 = gpu.subgroup_broadcast %0, specific_lane %idx : index
// CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
// CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 95b6d21097a37..cd889f8025e6f 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -543,14 +543,14 @@ func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4
return %2 : vector<4xi32>
}
-// CHECK-LABEL: func @broadcast_lane
+// CHECK-LABEL: func @subgroup_broadcast
// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32)
-func.func @broadcast_lane(%arg0 : f32, %arg1 : i32) -> (f32, f32, f32) {
- // CHECK: gpu.broadcast_lane %[[ARG]], first_lane : f32
- %0 = gpu.broadcast_lane %arg0, first_lane : f32
- // CHECK: gpu.broadcast_lane %[[ARG]], any_lane : f32
- %1 = gpu.broadcast_lane %arg0, any_lane : f32
- // CHECK: gpu.broadcast_lane %[[ARG]], lane %[[IDX]] : f32
- %2 = gpu.broadcast_lane %arg0, lane %arg1 : f32
+func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32, f32) {
+ // CHECK: gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
+ %0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
+ // CHECK: gpu.subgroup_broadcast %[[ARG]], any_lane : f32
+ %1 = gpu.subgroup_broadcast %arg0, any_lane : f32
+ // CHECK: gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
+ %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
func.return %0, %1, %2 : f32, f32, f32
}
More information about the Mlir-commits
mailing list