[llvm] [Offload] Have `doJITPostProcessing` accept multiple binaries (PR #148608)
Ross Brunton via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 14 03:49:53 PDT 2025
https://github.com/RossBrunton updated https://github.com/llvm/llvm-project/pull/148608
From c4af9152963d64ca87c7928d595f93c4291766b6 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Mon, 14 Jul 2025 11:44:11 +0100
Subject: [PATCH 1/2] [Offload] Have `doJITPostProcessing` accept multiple
binaries
The device handler for JIT post-processing of images now accepts a list of
buffers rather than a single one. Devices are expected either to link them
all into a single binary or to return an error code if they do not support
linking.
Currently, only the amdgpu plugin supports multiple binaries.
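To make the new contract concrete, here is a minimal standalone sketch (not
code from this patch; the function name and the use of createStringError are
illustrative assumptions) of how a hook with the new multi-buffer signature
behaves: it rejects more than one buffer when the device cannot link, and it
passes ownership of a single buffer straight through.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/MemoryBuffer.h"
    #include <memory>

    using namespace llvm;

    // Hypothetical stand-in for GenericDeviceTy::doJITPostProcessing; the real
    // default implementation uses the offload error types shown in the diff
    // below.
    static Expected<std::unique_ptr<MemoryBuffer>>
    postProcessSketch(SmallVector<std::unique_ptr<MemoryBuffer>> &&MB) {
      // A device that cannot link multiple inputs reports an error instead of
      // silently dropping buffers.
      if (MB.size() > 1)
        return createStringError(
            inconvertibleErrorCode(),
            "plugin does not support linking multiple binaries");
      // Single input: hand ownership of the buffer back to the JIT engine.
      return std::move(MB[0]);
    }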
---
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 67 ++++++++++---------
offload/plugins-nextgen/common/include/JIT.h | 2 +-
.../common/include/PluginInterface.h | 8 ++-
offload/plugins-nextgen/common/src/JIT.cpp | 7 +-
offload/plugins-nextgen/cuda/src/rtl.cpp | 14 ++--
5 files changed, 57 insertions(+), 41 deletions(-)
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 12c7cc62905c9..87ec1e7d01ead 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2066,31 +2066,35 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; }
- Expected<std::unique_ptr<MemoryBuffer>>
- doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
+ Expected<std::unique_ptr<MemoryBuffer>> doJITPostProcessing(
+ std::vector<std::unique_ptr<MemoryBuffer>> &&MB) const override {
// TODO: We should try to avoid materialization but there seems to be no
// good linker interface w/o file i/o.
- SmallString<128> LinkerInputFilePath;
- std::error_code EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit",
- "o", LinkerInputFilePath);
- if (EC)
- return Plugin::error(ErrorCode::HOST_IO,
- "failed to create temporary file for linker");
-
- // Write the file's contents to the output file.
- Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
- FileOutputBuffer::create(LinkerInputFilePath, MB->getBuffer().size());
- if (!OutputOrErr)
- return OutputOrErr.takeError();
- std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
- llvm::copy(MB->getBuffer(), Output->getBufferStart());
- if (Error E = Output->commit())
- return std::move(E);
+ llvm::SmallVector<SmallString<128>> InputFilenames;
+ for (auto &B : MB) {
+ SmallString<128> LinkerInputFilePath;
+ auto &Dest = InputFilenames.emplace_back();
+ std::error_code EC =
+ sys::fs::createTemporaryFile("amdgpu-pre-link-jit", "o", Dest);
+ if (EC)
+ return Plugin::error(ErrorCode::HOST_IO,
+ "failed to create temporary file for linker");
+
+ // Write the file's contents to the output file.
+ Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
+ FileOutputBuffer::create(Dest, B->getBuffer().size());
+ if (!OutputOrErr)
+ return OutputOrErr.takeError();
+ std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
+ llvm::copy(B->getBuffer(), Output->getBufferStart());
+ if (Error E = Output->commit())
+ return std::move(E);
+ }
SmallString<128> LinkerOutputFilePath;
- EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit", "so",
- LinkerOutputFilePath);
+ std::error_code EC = sys::fs::createTemporaryFile(
+ "amdgpu-pre-link-jit", "so", LinkerOutputFilePath);
if (EC)
return Plugin::error(ErrorCode::HOST_IO,
"failed to create temporary file for linker");
@@ -2105,15 +2109,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
"Using `%s` to link JITed amdgcn output.", LLDPath.c_str());
std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind();
- StringRef Args[] = {LLDPath,
- "-flavor",
- "gnu",
- "--no-undefined",
- "-shared",
- MCPU,
- "-o",
- LinkerOutputFilePath.data(),
- LinkerInputFilePath.data()};
+ std::vector<StringRef> Args = {
+ LLDPath, "-flavor", "gnu", "--no-undefined",
+ "-shared", MCPU, "-o", LinkerOutputFilePath.data()};
+ for (auto &N : InputFilenames) {
+ Args.push_back(N);
+ }
std::string Error;
int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error);
@@ -2131,9 +2132,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (sys::fs::remove(LinkerOutputFilePath))
return Plugin::error(ErrorCode::HOST_IO,
"failed to remove temporary output file for lld");
- if (sys::fs::remove(LinkerInputFilePath))
- return Plugin::error(ErrorCode::HOST_IO,
- "failed to remove temporary input file for lld");
+ for (auto &N : InputFilenames) {
+ if (sys::fs::remove(N))
+ return Plugin::error(ErrorCode::HOST_IO,
+ "failed to remove temporary input file for lld");
+ }
return std::move(*BufferOrErr);
}
diff --git a/offload/plugins-nextgen/common/include/JIT.h b/offload/plugins-nextgen/common/include/JIT.h
index 8c530436a754b..fe4af5fd651a1 100644
--- a/offload/plugins-nextgen/common/include/JIT.h
+++ b/offload/plugins-nextgen/common/include/JIT.h
@@ -44,7 +44,7 @@ struct JITEngine {
/// called.
using PostProcessingFn =
std::function<Expected<std::unique_ptr<MemoryBuffer>>(
- std::unique_ptr<MemoryBuffer>)>;
+ std::vector<std::unique_ptr<MemoryBuffer>> &&)>;
JITEngine(Triple::ArchType TA);
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 162b149ab483e..4cb7435bcd95f 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -935,8 +935,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Post processing after jit backend. The ownership of \p MB will be taken.
virtual Expected<std::unique_ptr<MemoryBuffer>>
- doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const {
- return std::move(MB);
+ doJITPostProcessing(std::vector<std::unique_ptr<MemoryBuffer>> &&MB) const {
+ if (MB.size() > 1)
+ return make_error<error::OffloadError>(
+ error::ErrorCode::UNSUPPORTED,
+ "Plugin does not support linking multiple binaries");
+ return std::move(MB[0]);
}
/// The minimum number of threads we use for a low-trip count combined loop.
diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp
index c82a06e36d8f9..aed508e0d93ac 100644
--- a/offload/plugins-nextgen/common/src/JIT.cpp
+++ b/offload/plugins-nextgen/common/src/JIT.cpp
@@ -292,7 +292,9 @@ JITEngine::compile(const __tgt_device_image &Image,
if (!ObjMBOrErr)
return ObjMBOrErr.takeError();
- auto ImageMBOrErr = PostProcessing(std::move(*ObjMBOrErr));
+ std::vector<std::unique_ptr<MemoryBuffer>> Buffers;
+ Buffers.push_back(std::move(*ObjMBOrErr));
+ auto ImageMBOrErr = PostProcessing(std::move(Buffers));
if (!ImageMBOrErr)
return ImageMBOrErr.takeError();
@@ -314,7 +316,8 @@ JITEngine::process(const __tgt_device_image &Image,
target::plugin::GenericDeviceTy &Device) {
const std::string &ComputeUnitKind = Device.getComputeUnitKind();
- PostProcessingFn PostProcessing = [&Device](std::unique_ptr<MemoryBuffer> MB)
+ PostProcessingFn PostProcessing =
+ [&Device](std::vector<std::unique_ptr<MemoryBuffer>> &&MB)
-> Expected<std::unique_ptr<MemoryBuffer>> {
return Device.doJITPostProcessing(std::move(MB));
};
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 15193de6ae430..e5f3c6214ae75 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -420,8 +420,14 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::success();
}
- Expected<std::unique_ptr<MemoryBuffer>>
- doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
+ Expected<std::unique_ptr<MemoryBuffer>> doJITPostProcessing(
+ std::vector<std::unique_ptr<MemoryBuffer>> &&MB) const override {
+ // TODO: This should be possible, just needs to be implemented
+ if (MB.size() > 1)
+ return make_error<error::OffloadError>(
+ error::ErrorCode::UNIMPLEMENTED,
+ "CUDA plugin does not support linking multiple binaries");
+
// TODO: We should be able to use the 'nvidia-ptxjitcompiler' interface to
// avoid the call to 'ptxas'.
SmallString<128> PTXInputFilePath;
@@ -433,11 +439,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
// Write the file's contents to the output file.
Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
- FileOutputBuffer::create(PTXInputFilePath, MB->getBuffer().size());
+ FileOutputBuffer::create(PTXInputFilePath, MB[0]->getBuffer().size());
if (!OutputOrErr)
return OutputOrErr.takeError();
std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
- llvm::copy(MB->getBuffer(), Output->getBufferStart());
+ llvm::copy(MB[0]->getBuffer(), Output->getBufferStart());
if (Error E = Output->commit())
return std::move(E);
From 736f4d404cf93128951dd1d542a3194c1888d22c Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Mon, 14 Jul 2025 11:49:38 +0100
Subject: [PATCH 2/2] Use SmallVector
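For context, a standalone illustration (not part of the patch): llvm::SmallVector
keeps a small, implementation-chosen number of elements in inline storage, so the
common case of a single JIT output buffer avoids the heap allocation that
std::vector makes for its first element.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Support/MemoryBuffer.h"
    #include <memory>

    using namespace llvm;

    // Builds a one-element buffer list; the unique_ptr fits in the
    // SmallVector's inline storage, so no heap allocation is needed.
    static SmallVector<std::unique_ptr<MemoryBuffer>> makeBufferList() {
      SmallVector<std::unique_ptr<MemoryBuffer>> Buffers;
      Buffers.push_back(
          MemoryBuffer::getMemBufferCopy("object bytes", "jit-object"));
      return Buffers;
    }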
---
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 2 +-
offload/plugins-nextgen/common/include/JIT.h | 2 +-
offload/plugins-nextgen/common/include/PluginInterface.h | 4 ++--
offload/plugins-nextgen/common/src/JIT.cpp | 2 +-
offload/plugins-nextgen/cuda/src/rtl.cpp | 2 +-
5 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 87ec1e7d01ead..9e0ba87909001 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2067,7 +2067,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; }
Expected<std::unique_ptr<MemoryBuffer>> doJITPostProcessing(
- std::vector<std::unique_ptr<MemoryBuffer>> &&MB) const override {
+ llvm::SmallVector<std::unique_ptr<MemoryBuffer>> &&MB) const override {
// TODO: We should try to avoid materialization but there seems to be no
// good linker interface w/o file i/o.
diff --git a/offload/plugins-nextgen/common/include/JIT.h b/offload/plugins-nextgen/common/include/JIT.h
index fe4af5fd651a1..1d6280a0af141 100644
--- a/offload/plugins-nextgen/common/include/JIT.h
+++ b/offload/plugins-nextgen/common/include/JIT.h
@@ -44,7 +44,7 @@ struct JITEngine {
/// called.
using PostProcessingFn =
std::function<Expected<std::unique_ptr<MemoryBuffer>>(
- std::vector<std::unique_ptr<MemoryBuffer>> &&)>;
+ llvm::SmallVector<std::unique_ptr<MemoryBuffer>> &&)>;
JITEngine(Triple::ArchType TA);
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 4cb7435bcd95f..7824257d28e1f 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -934,8 +934,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
virtual std::string getComputeUnitKind() const { return "unknown"; }
/// Post processing after jit backend. The ownership of \p MB will be taken.
- virtual Expected<std::unique_ptr<MemoryBuffer>>
- doJITPostProcessing(std::vector<std::unique_ptr<MemoryBuffer>> &&MB) const {
+ virtual Expected<std::unique_ptr<MemoryBuffer>> doJITPostProcessing(
+ llvm::SmallVector<std::unique_ptr<MemoryBuffer>> &&MB) const {
if (MB.size() > 1)
return make_error<error::OffloadError>(
error::ErrorCode::UNSUPPORTED,
diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp
index aed508e0d93ac..ab7d8f3c47ad0 100644
--- a/offload/plugins-nextgen/common/src/JIT.cpp
+++ b/offload/plugins-nextgen/common/src/JIT.cpp
@@ -317,7 +317,7 @@ JITEngine::process(const __tgt_device_image &Image,
const std::string &ComputeUnitKind = Device.getComputeUnitKind();
PostProcessingFn PostProcessing =
- [&Device](std::vector<std::unique_ptr<MemoryBuffer>> &&MB)
+ [&Device](llvm::SmallVector<std::unique_ptr<MemoryBuffer>> &&MB)
-> Expected<std::unique_ptr<MemoryBuffer>> {
return Device.doJITPostProcessing(std::move(MB));
};
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index e5f3c6214ae75..b916197bc5a6b 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -421,7 +421,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
}
Expected<std::unique_ptr<MemoryBuffer>> doJITPostProcessing(
- std::vector<std::unique_ptr<MemoryBuffer>> &&MB) const override {
+ llvm::SmallVector<std::unique_ptr<MemoryBuffer>> &&MB) const override {
// TODO: This should be possible, just needs to be implemented
if (MB.size() > 1)
return make_error<error::OffloadError>(