[flang-commits] [flang] [mlir] [mlir][GPU] Refactor GPUOps lowering (PR #188905)
Hocky Yudhiono via flang-commits
flang-commits at lists.llvm.org
Mon Mar 30 03:51:17 PDT 2026
https://github.com/hockyy updated https://github.com/llvm/llvm-project/pull/188905
>From 1ea904608dfdb69f2550f861bf01616961de56d3 Mon Sep 17 00:00:00 2001
From: Hocky Yudhiono <hocky.yudhiono at gmail.com>
Date: Mon, 30 Mar 2026 15:46:20 +0800
Subject: [PATCH 1/2] [mlir][GPU] Refactor GPUOps lowering
---
.../CUDA/CUFDeviceFuncTransform.cpp | 3 +-
.../LLVMCommon/LowerFunctionAttrsToLLVM.h | 39 +++++
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 53 +++----
mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp | 53 +------
.../Conversion/GPUCommon/GPUOpsLowering.cpp | 134 +++++++++---------
.../lib/Conversion/GPUCommon/GPUOpsLowering.h | 7 +
mlir/lib/Conversion/LLVMCommon/CMakeLists.txt | 1 +
.../LLVMCommon/LowerFunctionAttrsToLLVM.cpp | 63 ++++++++
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 70 +++++----
.../GPU/Transforms/KernelOutlining.cpp | 9 +-
.../Transforms/SparseGPUCodegen.cpp | 3 +-
.../Transforms/XeGPUSubgroupDistribute.cpp | 2 +-
12 files changed, 245 insertions(+), 192 deletions(-)
create mode 100644 mlir/include/mlir/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.h
create mode 100644 mlir/lib/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.cpp
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp
index 4532af98d8390..88382cafcc3bd 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp
@@ -68,8 +68,7 @@ class CUFDeviceFuncTransform
gpu::GPUFuncOp::create(builder, loc, funcOp.getName(), type,
mlir::TypeRange{}, mlir::TypeRange{});
if (isGlobal)
- deviceFuncOp->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
- builder.getUnitAttr());
+ deviceFuncOp.setKernel(true);
mlir::Region &deviceFuncBody = deviceFuncOp.getBody();
mlir::Block &entryBlock = deviceFuncBody.front();
diff --git a/mlir/include/mlir/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.h b/mlir/include/mlir/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.h
new file mode 100644
index 0000000000000..f12f9d2c90e2e
--- /dev/null
+++ b/mlir/include/mlir/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.h
@@ -0,0 +1,39 @@
+//===- LowerFunctionAttrsToLLVM.h - Lower func attrs to llvm.func - C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Shared helpers for lowering discardable attributes on any FunctionOpInterface
+// (e.g. func.func, gpu.func) into llvm.func properties and discardables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_CONVERSION_LLVMCOMMON_LOWERFUNCTIONATTRSTOLLVM_H
+#define MLIR_CONVERSION_LLVMCOMMON_LOWERFUNCTIONATTRSTOLLVM_H
+
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/OperationSupport.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+
+namespace mlir {
+
+/// Result of lowering the discardable attributes of a `FunctionOpInterface`
+/// op for `llvm.func`: `properties` holds the typed inherent properties and
+/// `discardableAttrs` the attributes that stay discardable on `llvm.func`.
+struct LowerLLVMFuncAttrs {
+ LLVM::LLVMFuncOp::Properties properties;
+ NamedAttrList discardableAttrs;
+};
+
+/// Lower discardable attributes on `func`: `llvm.<name>` entries naming an
+/// `llvm.func` ODS attribute become properties, un-prefixed ODS names are
+/// dropped, others are forwarded. Fails if the property bundle is invalid.
+FailureOr<LowerLLVMFuncAttrs>
+lowerDiscardableAttrsForLLVMFunc(FunctionOpInterface func);
+
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_LLVMCOMMON_LOWERFUNCTIONATTRSTOLLVM_H
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index f0a4dd44c8f67..a607bfc57f4a4 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -426,7 +426,9 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
OptionalAttr<DictArrayAttr>:$private_attrib_attrs,
GPU_OptionalDimSizeHintAttr:$known_block_size,
GPU_OptionalDimSizeHintAttr:$known_grid_size,
- GPU_OptionalDimSizeHintAttr:$known_cluster_size);
+ GPU_OptionalDimSizeHintAttr:$known_cluster_size,
+ OptionalAttr<ConfinedAttr<I64Attr, [IntNonNegative]>>:$workgroup_attributions,
+ UnitAttr:$kernel);
let regions = (region AnyRegion:$body);
let skipDefaultBuilders = 1;
@@ -440,22 +442,24 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
let extraClassDeclaration = [{
/// Returns `true` if the GPU function defined by this Op is a kernel, i.e.
- /// it is intended to be launched from host.
+ /// it is intended to be launched from host. Also accepts legacy discardable
+ /// `gpu.kernel` for compatibility with older generic IR.
bool isKernel() {
+ if (getKernel())
+ return true;
return (*this)->getAttrOfType<UnitAttr>(
GPUDialect::getKernelFuncAttrName()) != nullptr;
}
/// Returns the number of buffers located in the workgroup memory.
unsigned getNumWorkgroupAttributions() {
- auto attr = (*this)->getAttrOfType<IntegerAttr>(
- getNumWorkgroupAttributionsAttrName());
- if (!attr)
+ std::optional<int64_t> v = getWorkgroupAttributions();
+ if (!v)
return 0;
- int64_t value = attr.getInt();
- assert(value >= 0 && value < std::numeric_limits<uint32_t>::max() &&
+ int64_t value = *v;
+ assert(value < std::numeric_limits<uint32_t>::max() &&
"invalid workgroup attribution count");
- return value;
+ return static_cast<unsigned>(value);
}
/// Return the index of the first workgroup attribution in the block argument
@@ -466,7 +470,7 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
/// Returns a list of block arguments that correspond to buffers located in
/// the workgroup memory
- ArrayRef<BlockArgument> getWorkgroupAttributions() {
+ ArrayRef<BlockArgument> getWorkgroupAttributionBBArgs() {
auto begin =
std::next(getBody().args_begin(), getFirstWorkgroupAttributionIndex());
auto end = std::next(begin, getNumWorkgroupAttributions());
@@ -548,12 +552,6 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
return setPrivateAttributionAttr(index, StringAttr::get((*this)->getContext(), name), value);
}
- /// Returns the name of the attribute containing the number of buffers
- /// located in the workgroup memory.
- static StringRef getNumWorkgroupAttributionsAttrName() {
- return "workgroup_attributions";
- }
-
/// Returns the argument types of this function.
ArrayRef<Type> getArgumentTypes() { return getFunctionType().getInputs(); }
@@ -806,7 +804,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [
Optional<Index>:$clusterSizeZ,
Optional<I32>:$dynamicSharedMemorySize,
OptionalAttr<FlatSymbolRefAttr>:$module,
- OptionalAttr<FlatSymbolRefAttr>:$function)>,
+ OptionalAttr<FlatSymbolRefAttr>:$function,
+ OptionalAttr<ConfinedAttr<I64Attr, [IntNonNegative]>>:$workgroup_attributions)>,
Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
let summary = "GPU kernel launch operation";
@@ -1004,19 +1003,17 @@ def GPU_LaunchOp : GPU_Op<"launch", [
/// Returns the number of buffers located in the workgroup memory.
unsigned getNumWorkgroupAttributions() {
- auto attr = (*this)->getAttrOfType<IntegerAttr>(
- getNumWorkgroupAttributionsAttrName());
- if (!attr)
+ std::optional<int64_t> v = getWorkgroupAttributions();
+ if (!v)
return 0;
- int64_t value = attr.getInt();
- assert(value >= 0 && value < std::numeric_limits<uint32_t>::max() &&
+ int64_t value = *v;
+ assert(value < std::numeric_limits<uint32_t>::max() &&
"invalid workgroup attribution count");
- return value;
+ return static_cast<unsigned>(value);
}
- /// Returns a list of block arguments that correspond to buffers located in
- /// the workgroup memory
- ArrayRef<BlockArgument> getWorkgroupAttributions() {
+ /// Block arguments for workgroup memory buffers
+ ArrayRef<BlockArgument> getWorkgroupAttributionBBArgs() {
auto begin =
std::next(getBody().args_begin(), getNumConfigRegionAttributes());
auto end = std::next(begin, getNumWorkgroupAttributions());
@@ -1047,12 +1044,6 @@ def GPU_LaunchOp : GPU_Op<"launch", [
/// Adds a new block argument that corresponds to buffers located in
/// private memory.
BlockArgument addPrivateAttribution(Type type, Location loc);
-
- /// Returns the name of the attribute containing the number of buffers
- /// located in the workgroup memory.
- static StringRef getNumWorkgroupAttributionsAttrName() {
- return "workgroup_attributions";
- }
}];
let hasCanonicalizer = 1;
diff --git a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
index 88abc4400c9b7..bd7bc47217bdc 100644
--- a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
+++ b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
@@ -19,6 +19,7 @@
#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
+#include "mlir/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
@@ -77,55 +78,6 @@ static void filterFuncAttributes(FunctionOpInterface func,
}
}
-/// Add custom lowered funcOp to llvm.func attributes here.
-struct LoweredFuncAttrs {
- LLVM::LLVMFuncOp::Properties properties;
- NamedAttrList discardableAttrs;
-};
-
-/// Lower discardable function attributes on `func.func` to attributes expected
-/// by `llvm.func`.
-static FailureOr<LoweredFuncAttrs>
-lowerFuncAttributes(FunctionOpInterface func) {
- MLIRContext *ctx = func->getContext();
- LoweredFuncAttrs lowered;
-
- llvm::SmallDenseSet<StringRef> odsAttrNames(
- LLVM::LLVMFuncOp::getAttributeNames().begin(),
- LLVM::LLVMFuncOp::getAttributeNames().end());
-
- NamedAttrList inherentAttrs;
-
- for (const NamedAttribute &attr : func->getDiscardableAttrs()) {
- StringRef attrName = attr.getName().strref();
-
- if (odsAttrNames.contains(attrName)) {
- LDBG() << "LLVM specific attributes: " << attrName
- << "should use llvm.* prefix, discarding it";
- continue;
- }
-
- StringRef inherent = attrName;
- if (inherent.consume_front("llvm.") && odsAttrNames.contains(inherent))
- inherentAttrs.set(inherent, attr.getValue()); // collect inherent attrs
- else
- lowered.discardableAttrs.push_back(attr);
- }
-
- // Convert collected inherent attrs into typed properties.
- if (!inherentAttrs.empty()) {
- DictionaryAttr dict = inherentAttrs.getDictionary(ctx);
- auto emitError = [&] {
- return func.emitOpError("invalid llvm.func property");
- };
- if (failed(LLVM::LLVMFuncOp::setPropertiesFromAttr(lowered.properties, dict,
- emitError))) {
- return failure();
- }
- }
- return lowered;
-}
-
static void buildLLVMFuncProperties(PatternRewriter &rewriter,
FunctionOpInterface srcFunc,
Type llvmFuncType,
@@ -385,7 +337,8 @@ FailureOr<LLVM::LLVMFuncOp> mlir::convertFuncOpToLLVMFuncOp(
return funcOp.emitError("C interface for variadic functions is not "
"supported yet.");
- FailureOr<LoweredFuncAttrs> loweredAttrs = lowerFuncAttributes(funcOp);
+ FailureOr<LowerLLVMFuncAttrs> loweredAttrs =
+ lowerDiscardableAttrsForLLVMFunc(funcOp);
if (failed(loweredAttrs))
return rewriter.notifyMatchFailure(funcOp,
"failed to lower func attributes");
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 6a705ebab7aa4..fc4643d573a3b 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -14,10 +14,14 @@
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/SymbolTable.h"
#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/DebugLog.h"
#include "llvm/Support/FormatVariadic.h"
+#define DEBUG_TYPE "gpu-lowering"
+
using namespace mlir;
LLVM::LLVMFuncOp mlir::getOrDefineFunction(Operation *moduleOp, Location loc,
@@ -74,6 +78,55 @@ LLVM::GlobalOp mlir::getOrCreateStringConstant(OpBuilder &b, Location loc,
name, attr, alignment, addrSpace);
}
+FailureOr<LowerLLVMFuncAttrs> GPUFuncOpLowering::buildLoweredGPULLVMFuncAttrs(
+    gpu::GPUFuncOp gpuFuncOp, Type llvmFuncType, OpBuilder &rewriter) const {
+  FailureOr<LowerLLVMFuncAttrs> loweredAttrs =
+      lowerDiscardableAttrsForLLVMFunc(gpuFuncOp);
+  if (failed(loweredAttrs))
+    return failure();
+
+  MLIRContext *ctx = rewriter.getContext();
+  LLVM::LLVMFuncOp::Properties &props = loweredAttrs->properties;
+  props.sym_name = rewriter.getStringAttr(gpuFuncOp.getName());
+  props.function_type = TypeAttr::get(llvmFuncType);
+  const bool isKernelFunc = gpuFuncOp.isKernel();
+  props.setCConv(LLVM::CConvAttr::get(ctx, isKernelFunc
+                                               ? kernelCallingConvention
+                                               : nonKernelCallingConvention));
+
+  NamedAttrList &discardable = loweredAttrs->discardableAttrs;
+  auto *gpuDialect = cast<gpu::GPUDialect>(gpuFuncOp->getDialect());
+
+  // Use `set` rather than `append`: a forwarded discardable attribute (e.g. a
+  // legacy `gpu.kernel`) may already use the name, and names must be unique.
+  auto setIfNameAndValue = [&](StringAttr name, Attribute value) {
+    if (name && value)
+      discardable.set(name, value);
+  };
+
+  DenseI32ArrayAttr knownBlockSize = gpuFuncOp.getKnownBlockSizeAttr();
+  DenseI32ArrayAttr knownClusterSize = gpuFuncOp.getKnownClusterSizeAttr();
+  setIfNameAndValue(gpuDialect->getKnownBlockSizeAttrHelper().getName(),
+                    knownBlockSize);
+  setIfNameAndValue(gpuDialect->getKnownGridSizeAttrHelper().getName(),
+                    gpuFuncOp.getKnownGridSizeAttr());
+  setIfNameAndValue(gpuDialect->getKnownClusterSizeAttrHelper().getName(),
+                    knownClusterSize);
+
+  if (isKernelFunc) {
+    discardable.set(gpuDialect->getKernelFuncAttrName(),
+                    rewriter.getUnitAttr());
+    // Add a dialect specific kernel attribute in addition to GPU kernel
+    // attribute. The former is necessary for further translation while the
+    // latter is expected by gpu.launch_func.
+    setIfNameAndValue(kernelAttributeName, rewriter.getUnitAttr());
+    setIfNameAndValue(kernelBlockSizeAttributeName, knownBlockSize);
+    setIfNameAndValue(kernelClusterSizeAttributeName, knownClusterSize);
+  }
+
+  return loweredAttrs;
+}
+
LogicalResult
GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
@@ -85,7 +138,7 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
// workgroup attributions.
ArrayRef<BlockArgument> workgroupAttributions =
- gpuFuncOp.getWorkgroupAttributions();
+ gpuFuncOp.getWorkgroupAttributionBBArgs();
size_t numAttributions = workgroupAttributions.size();
// Insert all arguments at the end.
@@ -136,7 +189,7 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
} else {
workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
for (auto [idx, attribution] :
- llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
+ llvm::enumerate(gpuFuncOp.getWorkgroupAttributionBBArgs())) {
auto type = dyn_cast<MemRefType>(attribution.getType());
assert(type && type.hasStaticShape() && "unexpected type in attribution");
@@ -174,67 +227,17 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
});
}
- // Create the new function operation. Only copy those attributes that are
- // not specific to function modeling.
- SmallVector<NamedAttribute, 4> attributes;
- ArrayAttr argAttrs;
- for (const auto &attr : gpuFuncOp->getAttrs()) {
- if (attr.getName() == SymbolTable::getSymbolAttrName() ||
- attr.getName() == gpuFuncOp.getFunctionTypeAttrName() ||
- attr.getName() ==
- gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName() ||
- attr.getName() == gpuFuncOp.getWorkgroupAttribAttrsAttrName() ||
- attr.getName() == gpuFuncOp.getPrivateAttribAttrsAttrName() ||
- attr.getName() == gpuFuncOp.getKnownBlockSizeAttrName() ||
- attr.getName() == gpuFuncOp.getKnownGridSizeAttrName() ||
- attr.getName() == gpuFuncOp.getKnownClusterSizeAttrName())
- continue;
- if (attr.getName() == gpuFuncOp.getArgAttrsAttrName()) {
- argAttrs = gpuFuncOp.getArgAttrsAttr();
- continue;
- }
- attributes.push_back(attr);
- }
+ ArrayAttr argAttrs = gpuFuncOp.getArgAttrsAttr();
- DenseI32ArrayAttr knownBlockSize = gpuFuncOp.getKnownBlockSizeAttr();
- DenseI32ArrayAttr knownGridSize = gpuFuncOp.getKnownGridSizeAttr();
- DenseI32ArrayAttr knownClusterSize = gpuFuncOp.getKnownClusterSizeAttr();
- // Ensure we don't lose information if the function is lowered before its
- // surrounding context.
- auto *gpuDialect = cast<gpu::GPUDialect>(gpuFuncOp->getDialect());
- if (knownBlockSize)
- attributes.emplace_back(gpuDialect->getKnownBlockSizeAttrHelper().getName(),
- knownBlockSize);
- if (knownGridSize)
- attributes.emplace_back(gpuDialect->getKnownGridSizeAttrHelper().getName(),
- knownGridSize);
- if (knownClusterSize)
- attributes.emplace_back(
- gpuDialect->getKnownClusterSizeAttrHelper().getName(),
- knownClusterSize);
-
- // Add a dialect specific kernel attribute in addition to GPU kernel
- // attribute. The former is necessary for further translation while the
- // latter is expected by gpu.launch_func.
- if (gpuFuncOp.isKernel()) {
- if (kernelAttributeName)
- attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
- // Set the dialect-specific block size attribute if there is one.
- if (kernelBlockSizeAttributeName && knownBlockSize) {
- attributes.emplace_back(kernelBlockSizeAttributeName, knownBlockSize);
- }
- // Set the dialect-specific cluster size attribute if there is one.
- if (kernelClusterSizeAttributeName && knownClusterSize) {
- attributes.emplace_back(kernelClusterSizeAttributeName, knownClusterSize);
- }
- }
- LLVM::CConv callingConvention = gpuFuncOp.isKernel()
- ? kernelCallingConvention
- : nonKernelCallingConvention;
- auto llvmFuncOp = LLVM::LLVMFuncOp::create(
- rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
- LLVM::Linkage::External, /*dsoLocal=*/false, callingConvention,
- /*comdat=*/nullptr, attributes);
+ FailureOr<LowerLLVMFuncAttrs> loweredAttrs =
+ buildLoweredGPULLVMFuncAttrs(gpuFuncOp, funcType, rewriter);
+ if (failed(loweredAttrs))
+ return rewriter.notifyMatchFailure(gpuFuncOp,
+ "failed to lower func attributes");
+
+ auto llvmFuncOp = LLVM::LLVMFuncOp::create(rewriter, gpuFuncOp.getLoc(),
+ loweredAttrs->properties,
+ loweredAttrs->discardableAttrs);
{
// Insert operations that correspond to converted workgroup and private
@@ -260,8 +263,9 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
ArrayRef<BlockArgument> attributionArguments =
gpuFuncOp.getArguments().slice(numProperArguments - numAttributions,
numAttributions);
- for (auto [idx, vals] : llvm::enumerate(llvm::zip_equal(
- gpuFuncOp.getWorkgroupAttributions(), attributionArguments))) {
+ for (auto [idx, vals] : llvm::enumerate(
+ llvm::zip_equal(gpuFuncOp.getWorkgroupAttributionBBArgs(),
+ attributionArguments))) {
auto [attribution, arg] = vals;
auto type = cast<MemRefType>(attribution.getType());
@@ -287,7 +291,7 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
// existing memref infrastructure. This may use more registers than
// otherwise necessary given that memref sizes are fixed, but we can try
// and canonicalize that away later.
- Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx];
+ Value attribution = gpuFuncOp.getWorkgroupAttributionBBArgs()[idx];
auto type = cast<MemRefType>(attribution.getType());
Value descr = MemRefDescriptor::fromStaticShape(
rewriter, loc, *getTypeConverter(), type, memory);
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index a3b2e04c35313..7aa1d8da233a4 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -8,6 +8,7 @@
#ifndef MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
#define MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
+#include "mlir/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
@@ -106,6 +107,12 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override;
+ /// Lower discardable attrs like `func` lowering, then set `llvm.func`
+ /// properties and append GPU / target-specific discardable metadata.
+ FailureOr<LowerLLVMFuncAttrs>
+ buildLoweredGPULLVMFuncAttrs(gpu::GPUFuncOp gpuFuncOp, Type llvmFuncType,
+ OpBuilder &rewriter) const;
+
private:
/// The address space to use for `alloca`s in private memory.
unsigned allocaAddrSpace;
diff --git a/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt b/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt
index 568d9339aaabc..77f5013764d96 100644
--- a/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt
@@ -1,5 +1,6 @@
add_mlir_conversion_library(MLIRLLVMCommonConversion
ConversionTarget.cpp
+ LowerFunctionAttrsToLLVM.cpp
LoweringOptions.cpp
MemRefBuilder.cpp
Pattern.cpp
diff --git a/mlir/lib/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.cpp b/mlir/lib/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.cpp
new file mode 100644
index 0000000000000..b9d5dbfbe9008
--- /dev/null
+++ b/mlir/lib/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.cpp
@@ -0,0 +1,63 @@
+//===- LowerFunctionAttrsToLLVM.cpp - Lower func attrs to llvm.func -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Support/DebugLog.h"
+
+using namespace mlir;
+
+#define DEBUG_TYPE "lower-function-attrs-to-llvm"
+
+FailureOr<LowerLLVMFuncAttrs>
+mlir::lowerDiscardableAttrsForLLVMFunc(FunctionOpInterface func) {
+  MLIRContext *ctx = func->getContext();
+  LowerLLVMFuncAttrs lowered;
+
+  llvm::SmallDenseSet<StringRef> odsAttrNames(
+      LLVM::LLVMFuncOp::getAttributeNames().begin(),
+      LLVM::LLVMFuncOp::getAttributeNames().end());
+
+  NamedAttrList inherentAttrs;
+
+  for (const NamedAttribute &attr : func->getDiscardableAttrs()) {
+    StringRef attrName = attr.getName().strref();
+
+    // With properties-backed ops, `getDiscardableAttrs()` yields every entry in
+    // the raw attribute dictionary, which should only hold discardable
+    // attributes but can still contain duplicate keys for inherent attributes.
+    // Never forward those to llvm.func.
+    if (func->getInherentAttr(attrName).has_value())
+      continue;
+
+    // Un-prefixed names of llvm.func attributes are dropped, not forwarded.
+    if (odsAttrNames.contains(attrName)) {
+      LDBG() << "LLVM specific attributes: " << attrName
+             << " should use llvm.* prefix, discarding it";
+      continue;
+    }
+
+    StringRef inherent = attrName;
+    if (inherent.consume_front("llvm.") && odsAttrNames.contains(inherent))
+      inherentAttrs.set(inherent, attr.getValue()); // collect inherent attrs
+    else
+      lowered.discardableAttrs.push_back(attr);
+  }
+
+  // Convert collected inherent attrs into typed properties.
+  if (!inherentAttrs.empty()) {
+    DictionaryAttr dict = inherentAttrs.getDictionary(ctx);
+    auto emitError = [&] {
+      return func.emitOpError("invalid llvm.func property");
+    };
+    if (failed(LLVM::LLVMFuncOp::setPropertiesFromAttr(lowered.properties, dict,
+                                                       emitError)))
+      return failure();
+  }
+  return lowered;
+}
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index aff6bce57b5a0..17e7a8be5c2c9 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -42,6 +42,7 @@
#include "llvm/Support/StringSaver.h"
#include <cassert>
#include <numeric>
+#include <optional>
using namespace mlir;
using namespace mlir::gpu;
@@ -249,8 +250,10 @@ bool GPUDialect::hasWorkgroupMemoryAddressSpace(MemRefType type) {
}
bool GPUDialect::isKernel(Operation *op) {
- UnitAttr isKernelAttr = op->getAttrOfType<UnitAttr>(getKernelFuncAttrName());
- return static_cast<bool>(isKernelAttr);
+ if (auto gpuFunc = dyn_cast<GPUFuncOp>(op))
+ return gpuFunc.isKernel();
+ return static_cast<bool>(
+ op->getAttrOfType<UnitAttr>(getKernelFuncAttrName()));
}
namespace {
@@ -697,10 +700,10 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
FlatSymbolRefAttr module, FlatSymbolRefAttr function) {
OpBuilder::InsertionGuard g(builder);
- // Add a WorkGroup attribution attribute. This attribute is required to
- // identify private attributions in the list of block argguments.
- result.addAttribute(getNumWorkgroupAttributionsAttrName(),
- builder.getI64IntegerAttr(workgroupAttributions.size()));
+ if (!workgroupAttributions.empty())
+ result.addAttribute(
+ getWorkgroupAttributionsAttrName(result.name),
+ builder.getI64IntegerAttr(workgroupAttributions.size()));
// Add Op operands.
result.addOperands(asyncDependencies);
@@ -826,7 +829,7 @@ LogicalResult LaunchOp::verifyRegions() {
}
// Verify Attributions Address Spaces.
- if (failed(verifyAttributions(getOperation(), getWorkgroupAttributions(),
+ if (failed(verifyAttributions(getOperation(), getWorkgroupAttributionBBArgs(),
GPUDialect::getWorkgroupAddressSpace())) ||
failed(verifyAttributions(getOperation(), getPrivateAttributions(),
GPUDialect::getPrivateAddressSpace())))
@@ -905,7 +908,7 @@ void LaunchOp::print(OpAsmPrinter &p) {
p << ')';
}
- printAttributions(p, getWorkgroupKeyword(), getWorkgroupAttributions());
+ printAttributions(p, getWorkgroupKeyword(), getWorkgroupAttributionBBArgs());
printAttributions(p, getPrivateKeyword(), getPrivateAttributions());
p << ' ';
@@ -913,7 +916,7 @@ void LaunchOp::print(OpAsmPrinter &p) {
p.printRegion(getBody(), /*printEntryBlockArgs=*/false);
p.printOptionalAttrDict((*this)->getAttrs(), /*elidedAttrs=*/{
LaunchOp::getOperandSegmentSizeAttr(),
- getNumWorkgroupAttributionsAttrName(),
+ getWorkgroupAttributionsAttrName(),
moduleAttrName, functionAttrName});
}
@@ -1056,12 +1059,9 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
return failure();
}
- // Create the region arguments, it has kNumConfigRegionAttributes arguments
- // that correspond to block/thread identifiers and grid/block sizes, all
- // having `index` type, a variadic number of WorkGroup Attributions and
- // a variadic number of Private Attributions. The number of WorkGroup
- // Attributions is stored in the attr with name:
- // LaunchOp::getNumWorkgroupAttributionsAttrName().
+ // Create the region arguments: fixed launch-config args (`index`), then
+ // workgroup / private attribution args. The workgroup count is stored in the
+ // inherent `workgroup_attributions` attribute when non-zero.
Type index = parser.getBuilder().getIndexType();
SmallVector<Type, LaunchOp::kNumConfigRegionAttributes> dataTypes(
LaunchOp::kNumConfigRegionAttributes + 6, index);
@@ -1085,8 +1085,9 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
unsigned numWorkgroupAttrs = regionArguments.size() -
LaunchOp::kNumConfigRegionAttributes -
(hasCluster ? 6 : 0);
- result.addAttribute(LaunchOp::getNumWorkgroupAttributionsAttrName(),
- builder.getI64IntegerAttr(numWorkgroupAttrs));
+ if (numWorkgroupAttrs != 0)
+ result.addAttribute(LaunchOp::getWorkgroupAttributionsAttrName(result.name),
+ builder.getI64IntegerAttr(numWorkgroupAttrs));
// Parse private memory attributions.
if (failed(parseAttributions(parser, LaunchOp::getPrivateKeyword(),
@@ -1160,12 +1161,10 @@ void LaunchOp::getCanonicalizationPatterns(RewritePatternSet &rewrites,
/// Adds a new block argument that corresponds to buffers located in
/// workgroup memory.
BlockArgument LaunchOp::addWorkgroupAttribution(Type type, Location loc) {
- auto attrName = getNumWorkgroupAttributionsAttrName();
- auto attr = (*this)->getAttrOfType<IntegerAttr>(attrName);
- (*this)->setAttr(attrName,
- IntegerAttr::get(attr.getType(), attr.getValue() + 1));
+ int64_t cur = getWorkgroupAttributions().value_or(0);
+ setWorkgroupAttributions(std::optional<int64_t>(cur + 1));
return getBody().insertArgument(
- LaunchOp::getNumConfigRegionAttributes() + attr.getInt(), type, loc);
+ getNumConfigRegionAttributes() + static_cast<unsigned>(cur), type, loc);
}
/// Adds a new block argument that corresponds to buffers located in
@@ -1375,8 +1374,7 @@ LaunchFuncOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
return diag;
}
- if (!kernelFunc->getAttrOfType<mlir::UnitAttr>(
- GPUDialect::getKernelFuncAttrName()))
+ if (!GPUDialect::isKernel(kernelFunc))
return launchOp.emitOpError("kernel function is missing the '")
<< GPUDialect::getKernelFuncAttrName() << "' attribute";
@@ -1560,12 +1558,10 @@ void BarrierOp::build(OpBuilder &builder, OperationState &odsState,
/// Adds a new block argument that corresponds to buffers located in
/// workgroup memory.
BlockArgument GPUFuncOp::addWorkgroupAttribution(Type type, Location loc) {
- auto attrName = getNumWorkgroupAttributionsAttrName();
- auto attr = (*this)->getAttrOfType<IntegerAttr>(attrName);
- (*this)->setAttr(attrName,
- IntegerAttr::get(attr.getType(), attr.getValue() + 1));
+ int64_t cur = getWorkgroupAttributions().value_or(0);
+ setWorkgroupAttributions(std::optional<int64_t>(cur + 1));
return getBody().insertArgument(
- getFunctionType().getNumInputs() + attr.getInt(), type, loc);
+ getFunctionType().getNumInputs() + static_cast<unsigned>(cur), type, loc);
}
/// Adds a new block argument that corresponds to buffers located in
@@ -1587,7 +1583,7 @@ void GPUFuncOp::build(OpBuilder &builder, OperationState &result,
builder.getStringAttr(name));
result.addAttribute(getFunctionTypeAttrName(result.name),
TypeAttr::get(type));
- result.addAttribute(getNumWorkgroupAttributionsAttrName(),
+ result.addAttribute(getWorkgroupAttributionsAttrName(result.name),
builder.getI64IntegerAttr(workgroupAttributions.size()));
result.addAttributes(attrs);
Region *body = result.addRegion();
@@ -1696,8 +1692,10 @@ ParseResult GPUFuncOp::parse(OpAsmParser &parser, OperationState &result) {
// Store the number of operands we just parsed as the number of workgroup
// memory attributions.
unsigned numWorkgroupAttrs = entryArgs.size() - type.getNumInputs();
- result.addAttribute(GPUFuncOp::getNumWorkgroupAttributionsAttrName(),
- builder.getI64IntegerAttr(numWorkgroupAttrs));
+ if (numWorkgroupAttrs != 0)
+ result.addAttribute(
+ GPUFuncOp::getWorkgroupAttributionsAttrName(result.name),
+ builder.getI64IntegerAttr(numWorkgroupAttrs));
if (workgroupAttributionAttrs)
result.addAttribute(GPUFuncOp::getWorkgroupAttribAttrsAttrName(result.name),
workgroupAttributionAttrs);
@@ -1713,7 +1711,7 @@ ParseResult GPUFuncOp::parse(OpAsmParser &parser, OperationState &result) {
// Parse the kernel attribute if present.
if (succeeded(parser.parseOptionalKeyword(GPUFuncOp::getKernelKeyword())))
- result.addAttribute(GPUDialect::getKernelFuncAttrName(),
+ result.addAttribute(GPUFuncOp::getKernelAttrName(result.name),
builder.getUnitAttr());
// Parse attributes.
@@ -1735,7 +1733,7 @@ void GPUFuncOp::print(OpAsmPrinter &p) {
/*isVariadic=*/false,
type.getResults());
- printAttributions(p, getWorkgroupKeyword(), getWorkgroupAttributions(),
+ printAttributions(p, getWorkgroupKeyword(), getWorkgroupAttributionBBArgs(),
getWorkgroupAttribAttrs().value_or(nullptr));
printAttributions(p, getPrivateKeyword(), getPrivateAttributions(),
getPrivateAttribAttrs().value_or(nullptr));
@@ -1744,7 +1742,7 @@ void GPUFuncOp::print(OpAsmPrinter &p) {
function_interface_impl::printFunctionAttributes(
p, *this,
- {getNumWorkgroupAttributionsAttrName(),
+ {getWorkgroupAttributionsAttrName(), getKernelAttrName(),
GPUDialect::getKernelFuncAttrName(), getFunctionTypeAttrName(),
getArgAttrsAttrName(), getResAttrsAttrName(),
getWorkgroupAttribAttrsAttrName(), getPrivateAttribAttrsAttrName()});
@@ -1898,7 +1896,7 @@ LogicalResult GPUFuncOp::verifyBody() {
<< blockArgType;
}
- if (failed(verifyAttributions(getOperation(), getWorkgroupAttributions(),
+ if (failed(verifyAttributions(getOperation(), getWorkgroupAttributionBBArgs(),
GPUDialect::getWorkgroupAddressSpace())) ||
failed(verifyAttributions(getOperation(), getPrivateAttributions(),
GPUDialect::getPrivateAddressSpace())))
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index b9529b0d067f2..d2e74dce23078 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -197,10 +197,9 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
FunctionType::get(launchOp.getContext(), kernelOperandTypes, {});
auto outlinedFunc = gpu::GPUFuncOp::create(
builder, loc, kernelFnName, type,
- TypeRange(ValueRange(launchOp.getWorkgroupAttributions())),
+ TypeRange(ValueRange(launchOp.getWorkgroupAttributionBBArgs())),
TypeRange(ValueRange(launchOp.getPrivateAttributions())));
- outlinedFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
- builder.getUnitAttr());
+ outlinedFunc.setKernel(true);
// If we can infer bounds on the grid and/or block sizes from the arguments
// to the launch op, propagate them to the generated kernel. This is safe
@@ -227,8 +226,8 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
// Map memory attributions from the LaunchOp op to the GPUFuncOp attributions.
for (const auto &[launchArg, funcArg] :
- llvm::zip(launchOp.getWorkgroupAttributions(),
- outlinedFunc.getWorkgroupAttributions()))
+ llvm::zip(launchOp.getWorkgroupAttributionBBArgs(),
+ outlinedFunc.getWorkgroupAttributionBBArgs()))
map.map(launchArg, funcArg);
for (const auto &[launchArg, funcArg] :
llvm::zip(launchOp.getPrivateAttributions(),
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
index 0bd1d34c3504b..f6151ac68b192 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -82,8 +82,7 @@ static gpu::GPUFuncOp genGPUFunc(OpBuilder &builder, gpu::GPUModuleOp gpuModule,
FunctionType type = FunctionType::get(gpuModule->getContext(), argsTp, {});
auto gpuFunc =
gpu::GPUFuncOp::create(builder, gpuModule->getLoc(), kernelName, type);
- gpuFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
- builder.getUnitAttr());
+ gpuFunc.setKernel(true);
return gpuFunc;
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index ecdf253d68182..f156580824c93 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -173,7 +173,7 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
gpuFuncOp, "expected gpu.func terminator to be gpu.return");
// Create a new function with the same signature and same attributes.
SmallVector<Type> workgroupAttributionsTypes =
- llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributions(),
+ llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributionBBArgs(),
[](BlockArgument arg) { return arg.getType(); });
SmallVector<Type> privateAttributionsTypes =
llvm::map_to_vector(gpuFuncOp.getPrivateAttributions(),
>From fec0dad4186e0e732552ee5606cfc9400b2d8452 Mon Sep 17 00:00:00 2001
From: Hocky Yudhiono <hocky.yudhiono at gmail.com>
Date: Mon, 30 Mar 2026 18:50:27 +0800
Subject: [PATCH 2/2] [mlir][conversion] Remove unnecessary check
---
.../lib/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.cpp | 7 -------
1 file changed, 7 deletions(-)
diff --git a/mlir/lib/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.cpp b/mlir/lib/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.cpp
index b9d5dbfbe9008..eee7a96b369fd 100644
--- a/mlir/lib/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/LowerFunctionAttrsToLLVM.cpp
@@ -28,13 +28,6 @@ mlir::lowerDiscardableAttrsForLLVMFunc(FunctionOpInterface func) {
for (const NamedAttribute &attr : func->getDiscardableAttrs()) {
StringRef attrName = attr.getName().strref();
- // With properties-backed ops, `getDiscardableAttrs()` yields every entry in
- // the raw attribute dictionary, which should only hold discardable
- // attributes but can still contain duplicate keys for inherent attributes.
- // Never forward those to llvm.func.
- if (func->getInherentAttr(attrName).has_value())
- continue;
-
if (odsAttrNames.contains(attrName)) {
LDBG() << "LLVM specific attributes: " << attrName
<< "should use llvm.* prefix, discarding it";
More information about the flang-commits
mailing list