[Mlir-commits] [mlir] [MLIR][GPU] Generalize gpu.printf op lowering to LLVM call pattern. (PR #164297)
Sang Ik Lee
llvmlistbot at llvm.org
Tue Oct 21 10:29:25 PDT 2025
https://github.com/silee2 updated https://github.com/llvm/llvm-project/pull/164297
>From fcd98013cfe99eea6a0e1b9fad08ba4b40ffdb2b Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Fri, 17 Oct 2025 20:39:47 +0000
Subject: [PATCH 1/2] Generalize gpu.printf op to llvm call lowering pattern
for usage cases other than AMD gpu OpenCL runtime.
---
.../Conversion/GPUCommon/GPUOpsLowering.cpp | 6 ++--
.../lib/Conversion/GPUCommon/GPUOpsLowering.h | 15 +++++++---
.../Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp | 5 +++-
mlir/test/Conversion/GPUToLLVMSPV/printf.mlir | 16 ++++++++++
.../Dialect/XeVM/GPU/gpu_printf.mlir | 29 +++++++++++++++++++
5 files changed, 64 insertions(+), 7 deletions(-)
create mode 100644 mlir/test/Conversion/GPUToLLVMSPV/printf.mlir
create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 2285d2695db4e..eb662a1b056de 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -507,7 +507,8 @@ LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite(
LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType},
/*isVarArg=*/true);
LLVM::LLVMFuncOp printfDecl =
- getOrDefineFunction(moduleOp, loc, rewriter, "printf", printfType);
+ getOrDefineFunction(moduleOp, loc, rewriter, funcName, printfType);
+ printfDecl.setCConv(callingConvention);
// Create the global op or find an existing one.
LLVM::GlobalOp global = getOrCreateStringConstant(
@@ -530,7 +531,8 @@ LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite(
printfArgs.push_back(stringStart);
printfArgs.append(argsRange.begin(), argsRange.end());
- LLVM::CallOp::create(rewriter, loc, printfDecl, printfArgs);
+ auto call = LLVM::CallOp::create(rewriter, loc, printfDecl, printfArgs);
+ call.setCConv(callingConvention);
rewriter.eraseOp(gpuPrintfOp);
return success();
}
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index 66d3bb40a8f5a..adf5ba2feb591 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -10,6 +10,7 @@
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
namespace mlir {
@@ -142,13 +143,17 @@ struct GPUPrintfOpToHIPLowering : public ConvertOpToLLVMPattern<gpu::PrintfOp> {
/// This pass will add a declaration of printf() to the GPUModule if needed
/// and separate out the format strings into global constants. For some
/// runtimes, such as OpenCL on AMD, this is sufficient setup, as the compiler
-/// will lower printf calls to appropriate device-side code
+/// will lower printf calls to appropriate device-side code.
+/// callingConvention and funcName can be adjusted as needed.
struct GPUPrintfOpToLLVMCallLowering
: public ConvertOpToLLVMPattern<gpu::PrintfOp> {
- GPUPrintfOpToLLVMCallLowering(const LLVMTypeConverter &converter,
- int addressSpace = 0)
+ GPUPrintfOpToLLVMCallLowering(
+ const LLVMTypeConverter &converter, int addressSpace = 0,
+ LLVM::cconv::CConv callingConvention = LLVM::cconv::CConv::C,
+ StringRef funcName = "printf")
: ConvertOpToLLVMPattern<gpu::PrintfOp>(converter),
- addressSpace(addressSpace) {}
+ addressSpace(addressSpace), callingConvention(callingConvention),
+ funcName(funcName) {}
LogicalResult
matchAndRewrite(gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
@@ -156,6 +161,8 @@ struct GPUPrintfOpToLLVMCallLowering
private:
int addressSpace;
+ LLVM::cconv::CConv callingConvention;
+ StringRef funcName;
};
/// Lowering of gpu.printf to a vprintf standard library.
diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
index c2363a1a40294..29437f1ae5c0c 100644
--- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
+++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
@@ -470,10 +470,13 @@ struct GPUToLLVMSPVConversionPass final
gpu::GPUFuncOp, gpu::GlobalIdOp, gpu::GridDimOp,
gpu::LaneIdOp, gpu::NumSubgroupsOp, gpu::ReturnOp,
gpu::ShuffleOp, gpu::SubgroupIdOp, gpu::SubgroupSizeOp,
- gpu::ThreadIdOp>();
+ gpu::ThreadIdOp, gpu::PrintfOp>();
populateGpuToLLVMSPVConversionPatterns(converter, patterns);
populateGpuMemorySpaceAttributeConversions(converter);
+ patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/2,
+ LLVM::cconv::CConv::SPIR_FUNC,
+ "Z6printfPU3AS2Kcz");
if (failed(applyPartialConversion(getOperation(), target,
std::move(patterns))))
diff --git a/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir b/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir
new file mode 100644
index 0000000000000..1b17da9f4eeee
--- /dev/null
+++ b/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir
@@ -0,0 +1,16 @@
+// RUN: mlir-opt %s -convert-gpu-to-llvm-spv | FileCheck %s
+
+gpu.module @test_module {
+ // CHECK: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00") {addr_space = 2 : i32}
+ // CHECK: llvm.func spir_funccc @Z6printfPU3AS2Kcz(!llvm.ptr<2>, ...) -> i32
+ // CHECK-LABEL: llvm.func spir_funccc @test_printf
+ // CHECK: (%[[ARG0:.*]]: i32)
+ gpu.func @test_printf(%arg0: i32) {
+ // CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<2>
+ // CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr<2>) -> !llvm.ptr<2>, !llvm.array<11 x i8>
+ // CHECK-NEXT: %{{.*}} = llvm.call spir_funccc @Z6printfPU3AS2Kcz(%[[IMM2]], %[[ARG0]]) vararg(!llvm.func<i32 (ptr<2>, ...)>) : (!llvm.ptr<2>, i32) -> i32
+ gpu.printf "Hello: %d\n", %arg0 : i32
+ gpu.return
+ }
+}
+
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir
new file mode 100644
index 0000000000000..f9c305b04207b
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir
@@ -0,0 +1,29 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_sycl_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --shared-libs=%mlir_c_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @test attributes {gpu.container_module} {
+ gpu.module @test_module {
+ gpu.func @test_printf(%arg0: i32, %arg1: f32) kernel {
+ gpu.printf "Hello: %d\n", %arg0 : i32
+ gpu.printf "Hello: %f\n", %arg1 : f32
+ gpu.return
+ }
+ }
+
+ func.func @main() attributes {llvm.emit_c_interface} {
+ %c1 = arith.constant 1 : index
+ %c11 = arith.constant 11 : i32
+ %c4 = arith.constant 4.0 : f32
+ // CHECK: "Hello: 11"
+ gpu.launch_func @test_module::@test_printf blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%c11 : i32, %c4 : f32)
+ return
+ }
+}
>From 433d89eb792f36ee17fa4720601c4f2c48e21105 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Mon, 20 Oct 2025 18:25:26 +0000
Subject: [PATCH 2/2] Fix incorrect function name.
---
mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp | 2 +-
mlir/test/Conversion/GPUToLLVMSPV/printf.mlir | 4 ++--
mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir | 3 ++-
3 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
index 29437f1ae5c0c..25f1e1b184d61 100644
--- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
+++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
@@ -476,7 +476,7 @@ struct GPUToLLVMSPVConversionPass final
populateGpuMemorySpaceAttributeConversions(converter);
patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/2,
LLVM::cconv::CConv::SPIR_FUNC,
- "Z6printfPU3AS2Kcz");
+ "_Z6printfPU3AS2Kcz");
if (failed(applyPartialConversion(getOperation(), target,
std::move(patterns))))
diff --git a/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir b/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir
index 1b17da9f4eeee..74017e8354cf1 100644
--- a/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir
+++ b/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir
@@ -2,13 +2,13 @@
gpu.module @test_module {
// CHECK: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00") {addr_space = 2 : i32}
- // CHECK: llvm.func spir_funccc @Z6printfPU3AS2Kcz(!llvm.ptr<2>, ...) -> i32
+ // CHECK: llvm.func spir_funccc @_Z6printfPU3AS2Kcz(!llvm.ptr<2>, ...) -> i32
// CHECK-LABEL: llvm.func spir_funccc @test_printf
// CHECK: (%[[ARG0:.*]]: i32)
gpu.func @test_printf(%arg0: i32) {
// CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<2>
// CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr<2>) -> !llvm.ptr<2>, !llvm.array<11 x i8>
- // CHECK-NEXT: %{{.*}} = llvm.call spir_funccc @Z6printfPU3AS2Kcz(%[[IMM2]], %[[ARG0]]) vararg(!llvm.func<i32 (ptr<2>, ...)>) : (!llvm.ptr<2>, i32) -> i32
+ // CHECK-NEXT: %{{.*}} = llvm.call spir_funccc @_Z6printfPU3AS2Kcz(%[[IMM2]], %[[ARG0]]) vararg(!llvm.func<i32 (ptr<2>, ...)>) : (!llvm.ptr<2>, i32) -> i32
gpu.printf "Hello: %d\n", %arg0 : i32
gpu.return
}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir
index f9c305b04207b..edf8775c72418 100644
--- a/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir
@@ -22,7 +22,8 @@ module @test attributes {gpu.container_module} {
%c1 = arith.constant 1 : index
%c11 = arith.constant 11 : i32
%c4 = arith.constant 4.0 : f32
- // CHECK: "Hello: 11"
+ // CHECK: Hello: 11
+ // CHECK: Hello: 4.000000
gpu.launch_func @test_module::@test_printf blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%c11 : i32, %c4 : f32)
return
}
More information about the Mlir-commits
mailing list