[Mlir-commits] [mlir] [mlir][gpu] Use `known_block_size` to set `maxntid` for NVVM target (PR #77301)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Mon Jan 8 04:46:12 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir
Author: Guray Ozen (grypp)
<details>
<summary>Changes</summary>
Setting the thread block size with `maxntid` on the kernel has great performance benefits. In this way, the downstream PTX compiler can do better register allocation.
MLIR's `gpu.launch` and `gpu.launch_func` already have an attribute (`known_block_size`) that keeps the thread block size when it is known. This PR simply uses this attribute to set `maxntid`.
---
Full diff: https://github.com/llvm/llvm-project/pull/77301.diff
4 Files Affected:
- (modified) mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp (+19-1)
- (modified) mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h (+9-4)
- (modified) mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp (+3-1)
- (modified) mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir (+9)
``````````diff
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 6a005e67ca95ba..eeb8fbbb180bad 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -85,8 +85,26 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
// Add a dialect specific kernel attribute in addition to GPU kernel
// attribute. The former is necessary for further translation while the
// latter is expected by gpu.launch_func.
- if (gpuFuncOp.isKernel())
+ if (gpuFuncOp.isKernel()) {
attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
+
+ // Set the block size attribute if it is present.
+ if (kernelBlockSizeAttributeName.has_value()) {
+ std::optional<int32_t> dimX =
+ gpuFuncOp.getKnownBlockSize(gpu::Dimension::x);
+ std::optional<int32_t> dimY =
+ gpuFuncOp.getKnownBlockSize(gpu::Dimension::y);
+ std::optional<int32_t> dimZ =
+ gpuFuncOp.getKnownBlockSize(gpu::Dimension::z);
+ if (dimX.has_value() || dimY.has_value() || dimZ.has_value()) {
+ // If any of the dimensions are missing, fill them in with 1.
+ attributes.emplace_back(
+ kernelBlockSizeAttributeName.value(),
+ rewriter.getI32ArrayAttr(
+ {dimX.value_or(1), dimY.value_or(1), dimZ.value_or(1)}));
+ }
+ }
+ }
auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
LLVM::Linkage::External, /*dsoLocal=*/false, /*cconv=*/LLVM::CConv::C,
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index a77db4a036bad3..471a688e85463e 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -36,13 +36,15 @@ struct GPUDynamicSharedMemoryOpLowering
};
struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
- GPUFuncOpLowering(const LLVMTypeConverter &converter,
- unsigned allocaAddrSpace, unsigned workgroupAddrSpace,
- StringAttr kernelAttributeName)
+ GPUFuncOpLowering(
+ const LLVMTypeConverter &converter, unsigned allocaAddrSpace,
+ unsigned workgroupAddrSpace, StringAttr kernelAttributeName,
+ std::optional<StringAttr> kernelBlockSizeAttributeName = std::nullopt)
: ConvertOpToLLVMPattern<gpu::GPUFuncOp>(converter),
allocaAddrSpace(allocaAddrSpace),
workgroupAddrSpace(workgroupAddrSpace),
- kernelAttributeName(kernelAttributeName) {}
+ kernelAttributeName(kernelAttributeName),
+ kernelBlockSizeAttributeName(kernelBlockSizeAttributeName) {}
LogicalResult
matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
@@ -56,6 +58,9 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
/// The attribute name to use instead of `gpu.kernel`.
StringAttr kernelAttributeName;
+
+  /// The attribute name used to set the block size
+ std::optional<StringAttr> kernelBlockSizeAttributeName;
};
/// The lowering of gpu.printf to a call to HIP hostcalls
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index e60fe5cbd7603f..a7ac2332961ae2 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -352,7 +352,9 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
/*workgroupAddrSpace=*/
static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace),
StringAttr::get(&converter.getContext(),
- NVVM::NVVMDialect::getKernelFuncAttrName()));
+ NVVM::NVVMDialect::getKernelFuncAttrName()),
+ StringAttr::get(&converter.getContext(),
+ NVVM::NVVMDialect::getMaxntidAttrName()));
populateOpPatterns<math::AbsFOp>(converter, patterns, "__nv_fabsf",
"__nv_fabs");
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 20a200e812c125..c7f1d4f124c186 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -627,6 +627,15 @@ gpu.module @test_module_31 {
}
}
+gpu.module @gpumodule {
+// CHECK-LABEL: func @kernel_with_block_size()
+// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 128, 1, 1>, nvvm.kernel, nvvm.maxntid = [128 : i32, 1 : i32, 1 : i32]}
+ gpu.func @kernel_with_block_size() kernel attributes {gpu.known_block_size = array<i32: 128, 1, 1>} {
+ gpu.return
+ }
+}
+
+
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) {
%gpu_module = transform.structured.match ops{["gpu.module"]} in %toplevel_module
``````````
</details>
https://github.com/llvm/llvm-project/pull/77301
More information about the Mlir-commits
mailing list