[Mlir-commits] [mlir] [mlir][amdgpu] Introduce `assume_subgroup_uniform` op (PR #152740)
Ivan Butygin
llvmlistbot at llvm.org
Fri Aug 8 08:02:41 PDT 2025
https://github.com/Hardcode84 created https://github.com/llvm/llvm-project/pull/152740
`assume_subgroup_uniform` works as a compiler hint to force a specific value into a scalar register.
Currently implemented via `readfirstlane` intrinsic.
Unlike a direct `readfirstlane` call, this op is potentially speculatable and has the usual arith and int range interfaces.
>From d5dfd6f8667a8337de2c30e1e6c746bbec58a682 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Fri, 8 Aug 2025 12:20:32 +0200
Subject: [PATCH] [mlir][amdgpu] Introduce `assume_subgroup_uniform` op
`assume_subgroup_uniform` works as a compiler hint to force a specific value into a scalar register.
Currently implemented via `readfirstlane` intrinsic.
Unlike a direct `readfirstlane` call, this op is potentially speculatable and has the usual arith and int range interfaces.
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 38 +++++++++++++++++--
.../mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h | 1 +
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 17 ++++++++-
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 16 ++++++++
.../AMDGPUToROCDL/amdgpu-to-rocdl.mlir | 14 +++++++
mlir/test/Dialect/AMDGPU/canonicalize.mlir | 10 +++++
mlir/test/Dialect/AMDGPU/ops.mlir | 10 +++++
.../AMDGPU/subgroup-uniform-int-range.mlir | 13 +++++++
.../subgroup-uniform-speculability.mlir | 21 ++++++++++
9 files changed, 136 insertions(+), 4 deletions(-)
create mode 100644 mlir/test/Dialect/AMDGPU/subgroup-uniform-int-range.mlir
create mode 100644 mlir/test/Dialect/AMDGPU/subgroup-uniform-speculability.mlir
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 2c646934c11c2..b0b94ed49f2e5 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -9,12 +9,13 @@
#ifndef AMDGPU
#define AMDGPU
+include "mlir/IR/EnumAttr.td"
+include "mlir/IR/OpBase.td"
+include "mlir/IR/Properties.td"
+include "mlir/Interfaces/InferIntRangeInterface.td"
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Interfaces/ViewLikeInterface.td"
-include "mlir/IR/EnumAttr.td"
-include "mlir/IR/Properties.td"
-include "mlir/IR/OpBase.td"
def AMDGPU_Dialect : Dialect {
let name = "amdgpu";
@@ -635,6 +636,37 @@ def AMDGPU_DPPOp : AMDGPU_Op<"dpp",
let hasVerifier = 1;
}
+def AMDGPU_AssumeSubgroupUniformOp : AMDGPU_Op<"assume_subgroup_uniform",
+ [NoMemoryEffect, AllTypesMatch<["result", "src"]>,
+ DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
+ DeclareOpInterfaceMethods<ConditionallySpeculatable, ["getSpeculatability"]>] #
+ ElementwiseMappable.traits>,
+ Arguments<(ins AnyType:$src,
+ DefaultValuedAttr<UnitAttr, "false">:$all_lanes)> {
+ let summary = "Assumes the value is uniform across the lanes in the subgroup";
+ let description = [{
+ This op is a compiler hint to help the backend put values into scalar registers.
+
+ If the `src` value is uniform across all the active subgroup lanes it is
+ returned unchanged; otherwise the result is poison.
+
+ If `all_lanes` is set, the value is assumed to be uniform across all the
+ subgroup lanes; this allows speculating it out of control flow, which
+ may change the currently active lanes, e.g.:
+ ```
+ // %value must be uniform at this point
+ %value = ...
+ scf.if lane_id < 13 {
+ %uniform = amdgpu.assume_subgroup_uniform all_lanes %value
+ }
+ ```
+ }];
+ let results = (outs AnyType:$result);
+ let assemblyFormat = [{
+ (`all_lanes` $all_lanes^)? $src attr-dict `:` type($result)
+ }];
+}
+
def AMDGPU_SwizzleBitModeOp : AMDGPU_Op<"swizzle_bitmode",
[Pure, AllTypesMatch<["result", "src"]>]>,
Arguments<(ins AnyIntegerOrFloatOr1DVector:$src,
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
index 3de57c923178a..196ce08b5954c 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
@@ -18,6 +18,7 @@
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/InferIntRangeInterface.h"
#include "mlir/Interfaces/InferTypeOpInterface.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Interfaces/ViewLikeInterface.h"
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 64720bfe6cf50..3f52309005690 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1876,6 +1876,19 @@ struct AMDGPUSwizzleBitModeLowering
}
};
+struct AMDGPUAssumeSubgroupUniformLowering
+ : public ConvertOpToLLVMPattern<AssumeSubgroupUniformOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+ LogicalResult
+ matchAndRewrite(AssumeSubgroupUniformOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Value src = adaptor.getSrc();
+ rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(), src);
+ return success();
+ }
+};
+
struct ConvertAMDGPUToROCDLPass
: public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
using Base::Base;
@@ -1945,5 +1958,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
TransposeLoadOpLowering>(converter, chipset);
- patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
+ patterns
+ .add<AMDGPUSwizzleBitModeLowering, AMDGPUAssumeSubgroupUniformLowering>(
+ converter);
}
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index d7ffdcb58ddb5..0115a85ba0bfe 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -510,6 +510,22 @@ LogicalResult DPPOp::verify() {
return success();
}
+//===----------------------------------------------------------------------===//
+// AssumeSubgroupUniformOp
+//===----------------------------------------------------------------------===//
+
+void AssumeSubgroupUniformOp::inferResultRanges(
+ ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
+ setResultRange(getResult(), argRanges.front());
+}
+
+Speculation::Speculatability AssumeSubgroupUniformOp::getSpeculatability() {
+ if (getAllLanes())
+ return Speculation::Speculatable;
+
+ return Speculation::NotSpeculatable;
+}
+
//===----------------------------------------------------------------------===//
// GatherToLDSOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index cc1162d8b0de8..6eaf68f84e38f 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -461,3 +461,17 @@ func.func @sched_barrier() {
amdgpu.sched_barrier allow = <valu|all_vmem>
func.return
}
+
+// CHECK-LABEL: func @assume_subgroup_uniform
+// CHECK-SAME: (%[[ARG:.*]]: index)
+func.func @assume_subgroup_uniform(%arg0 : index) -> (index, index) {
+// CHECK: %[[SRC:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : index to i64
+// CHECK: %[[V1:.*]] = rocdl.readfirstlane %[[SRC]] : i64
+// CHECK: %[[RES1:.*]] = builtin.unrealized_conversion_cast %[[V1]] : i64 to index
+// CHECK: %[[V2:.*]] = rocdl.readfirstlane %[[SRC]] : i64
+// CHECK: %[[RES2:.*]] = builtin.unrealized_conversion_cast %[[V2]] : i64 to index
+// CHECK: return %[[RES1]], %[[RES2]] : index, index
+ %0 = amdgpu.assume_subgroup_uniform %arg0 : index
+ %1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : index
+ func.return %0, %1 : index, index
+}
diff --git a/mlir/test/Dialect/AMDGPU/canonicalize.mlir b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
index 5501ad42dbd90..141bd3f459738 100644
--- a/mlir/test/Dialect/AMDGPU/canonicalize.mlir
+++ b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
@@ -159,3 +159,13 @@ func.func @fold_gather_to_lds_of_cast_dest(%global: memref<128x72xf32, 1>, %lds:
: f32, memref<128x72xf32, 1>, memref<?x?xf32, 3>
func.return
}
+
+// -----
+
+// CHECK-LABEL: func @assume_subgroup_uniform_unused
+func.func @assume_subgroup_uniform_unused(%arg0 : f32) {
+// CHECK-NOT: amdgpu.assume_subgroup_uniform
+ %0 = amdgpu.assume_subgroup_uniform %arg0 : f32
+ %1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : f32
+ func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 87e11c028c62a..97b4d5f54506f 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -517,6 +517,16 @@ func.func @wmma(%arg0 : vector<16xf16>, %arg1 : vector<8xf16>) -> vector<8xf16>
func.return %0 : vector<8xf16>
}
+// CHECK-LABEL: func @assume_subgroup_uniform
+// CHECK-SAME: (%[[ARG:.*]]: f32)
+func.func @assume_subgroup_uniform(%arg0 : f32) -> (f32, f32) {
+ // CHECK: amdgpu.assume_subgroup_uniform %[[ARG]] : f32
+ %0 = amdgpu.assume_subgroup_uniform %arg0 : f32
+ // CHECK: amdgpu.assume_subgroup_uniform all_lanes %[[ARG]] : f32
+ %1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : f32
+ func.return %0, %1 : f32, f32
+}
+
// CHECK-LABEL: func @swizzle_bitmode
func.func @swizzle_bitmode(%arg0 : f32) -> f32 {
// CHECK: amdgpu.swizzle_bitmode
diff --git a/mlir/test/Dialect/AMDGPU/subgroup-uniform-int-range.mlir b/mlir/test/Dialect/AMDGPU/subgroup-uniform-int-range.mlir
new file mode 100644
index 0000000000000..be20bfdba3baf
--- /dev/null
+++ b/mlir/test/Dialect/AMDGPU/subgroup-uniform-int-range.mlir
@@ -0,0 +1,13 @@
+// RUN: mlir-opt --arith-int-range-narrowing="int-bitwidths-supported=32" --split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: func @narrow
+// CHECK: %[[SRC:.*]] = test.with_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index} : index
+// CHECK: %[[CAST1:.*]] = arith.index_castui %[[SRC]] : index to i32
+// CHECK: %[[VAL:.*]] = amdgpu.assume_subgroup_uniform %[[CAST1]] : i32
+// CHECK: %[[CAST2:.*]] = arith.index_castui %[[VAL]] : i32 to index
+// CHECK: return %[[CAST2]] : index
+func.func @narrow() -> index {
+ %0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
+ %1 = amdgpu.assume_subgroup_uniform %0 : index
+ return %1: index
+}
diff --git a/mlir/test/Dialect/AMDGPU/subgroup-uniform-speculability.mlir b/mlir/test/Dialect/AMDGPU/subgroup-uniform-speculability.mlir
new file mode 100644
index 0000000000000..9be2b5dda267e
--- /dev/null
+++ b/mlir/test/Dialect/AMDGPU/subgroup-uniform-speculability.mlir
@@ -0,0 +1,21 @@
+// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s
+
+func.func private @side_effect(%arg0 : f32, %arg1 : f32)
+
+// CHECK-LABEL: func @assume_subgroup_uniform_hoisting
+// CHECK-SAME: (%[[ARG:.*]]: f32)
+func.func @assume_subgroup_uniform_hoisting(%arg0 : f32) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c10 = arith.constant 10 : index
+// CHECK: %[[V1:.*]] = amdgpu.assume_subgroup_uniform all_lanes %[[ARG]] : f32
+// CHECK: scf.for
+// CHECK: %[[V0:.*]] = amdgpu.assume_subgroup_uniform %[[ARG]] : f32
+// CHECK: func.call @side_effect(%[[V0]], %[[V1]])
+ scf.for %i = %c0 to %c10 step %c1 {
+ %0 = amdgpu.assume_subgroup_uniform %arg0 : f32
+ %1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : f32
+ func.call @side_effect(%0, %1) : (f32, f32) -> ()
+ }
+ func.return
+}
More information about the Mlir-commits
mailing list