[flang] [llvm] [flang-rt] Implement basic support for I/O from OpenMP GPU Offloading (PR #181039)

Mon Feb 16 15:59:58 PST 2026

https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/181039

>From 6e82d6af19afa19fc9a6ba63cc3aed4c48f39774 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 11 Feb 2026 08:13:41 -0600
Subject: [PATCH] [flang-rt] Implement basic support for I/O from OpenMP GPU
 Offloading

Summary:
This PR provides the minimal support for Fortran I/O coming from a GPU
in OpenMP offloading. We use the same support the `libc` uses for its
printing through the RPC server. The helper functions `rpc::dispatch`
and `rpc::invoke` help make this mostly automatic.

Because Fortran I/O is not reentrant, the vast majority of complexity
comes from needing to stitch together calls from the GPU until they can
be executed all at once. This is needed not only because of the
limitations of recursive I/O, but also because the output would otherwise
be interleaved due to the GPU's lock-step execution.

As such, the return values from the intermediate functions are
meaningless, all returning true. The final value is correct however. For
cookies we create a context pointer on the server to chain these
together.

Link in and use the runtime call from the RPC handler

update test

comments

forgot this one

And for the GPU code

Make libc dependency optional

Use IODEF right

Remove virtual method and fix potential leak
---
 flang-rt/cmake/modules/AddFlangRT.cmake       |   9 +-
 flang-rt/include/flang-rt/runtime/memory.h    |   8 +-
 flang-rt/lib/runtime/CMakeLists.txt           |   7 +
 flang-rt/lib/runtime/io-api-gpu.cpp           |  99 +++++++
 flang-rt/lib/runtime/io-api-gpu.h             |  82 ++++++
 flang-rt/lib/runtime/io-api-server.cpp        | 267 ++++++++++++++++++
 flang/include/flang/Runtime/io-api.h          |   2 +
 offload/plugins-nextgen/common/CMakeLists.txt |   4 +
 offload/plugins-nextgen/common/src/RPC.cpp    |  10 +
 offload/test/lit.cfg                          |   2 -
 offload/test/offloading/fortran/io.f90        |  27 ++
 runtimes/CMakeLists.txt                       |   2 +-
 12 files changed, 512 insertions(+), 7 deletions(-)
 create mode 100644 flang-rt/lib/runtime/io-api-gpu.cpp
 create mode 100644 flang-rt/lib/runtime/io-api-gpu.h
 create mode 100644 flang-rt/lib/runtime/io-api-server.cpp
 create mode 100644 offload/test/offloading/fortran/io.f90

diff --git a/flang-rt/cmake/modules/AddFlangRT.cmake b/flang-rt/cmake/modules/AddFlangRT.cmake
index 923507764d691..3775fe8494634 100644
--- a/flang-rt/cmake/modules/AddFlangRT.cmake
+++ b/flang-rt/cmake/modules/AddFlangRT.cmake
@@ -122,6 +122,11 @@ function (add_flangrt_library name)
     list(APPEND extra_args EXCLUDE_FROM_ALL)
   endif ()
 
+  # Link against the RPC utilities from the `libc` project when available.
+  if (TARGET llvm-libc-common-utilities)
+    set(extra_deps llvm-libc-common-utilities)
+  endif()
+
   # Also add header files to IDEs to list as part of the library.
   set_source_files_properties(${ARG_ADDITIONAL_HEADERS} PROPERTIES HEADER_FILE_ONLY ON)
 
@@ -139,11 +144,11 @@ function (add_flangrt_library name)
   endif ()
   if (build_static)
     add_library("${name_static}" STATIC ${extra_args} ${ARG_ADDITIONAL_HEADERS} ${ARG_UNPARSED_ARGUMENTS})
-    target_link_libraries("${name_static}" PRIVATE flang-rt-libcxx-headers flang-rt-libc-headers flang-rt-libc-static)
+    target_link_libraries("${name_static}" PRIVATE flang-rt-libcxx-headers flang-rt-libc-headers flang-rt-libc-static ${extra_deps})
   endif ()
   if (build_shared)
     add_library("${name_shared}" SHARED ${extra_args} ${ARG_ADDITIONAL_HEADERS} ${ARG_UNPARSED_ARGUMENTS})
-    target_link_libraries("${name_shared}" PRIVATE flang-rt-libcxx-headers flang-rt-libc-headers flang-rt-libc-shared)
+    target_link_libraries("${name_shared}" PRIVATE flang-rt-libcxx-headers flang-rt-libc-headers flang-rt-libc-shared  ${extra_deps})
     if (Threads_FOUND) 
       target_link_libraries(${name_shared} PUBLIC Threads::Threads)
     endif ()
diff --git a/flang-rt/include/flang-rt/runtime/memory.h b/flang-rt/include/flang-rt/runtime/memory.h
index 93b477afa9814..ba12a60e07c90 100644
--- a/flang-rt/include/flang-rt/runtime/memory.h
+++ b/flang-rt/include/flang-rt/runtime/memory.h
@@ -44,7 +44,8 @@ template <typename A> RT_API_ATTRS void FreeMemoryAndNullify(A *&p) {
 // and does not support array objects with runtime length.
 template <typename A> class OwningPtr {
 public:
-  using pointer_type = A *;
+  using element_type = std::remove_extent_t<A>;
+  using pointer_type = element_type *;
 
   OwningPtr() = default;
   RT_API_ATTRS explicit OwningPtr(pointer_type p) : ptr_(p) {}
@@ -109,7 +110,10 @@ template <typename A> class OwningPtr {
   RT_API_ATTRS pointer_type operator->() const { return get(); }
 
 private:
-  RT_API_ATTRS void delete_ptr(pointer_type p) { FreeMemory(p); }
+  RT_API_ATTRS void delete_ptr(pointer_type p) {
+    p->~element_type();
+    FreeMemory(p);
+  }
   pointer_type ptr_{};
 };
 
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index 787d0dbbfb5ca..45dca9e4076d3 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -12,6 +12,9 @@ find_package(Backtrace)
 set(HAVE_BACKTRACE ${Backtrace_FOUND})
 set(BACKTRACE_HEADER ${Backtrace_HEADER})
 
+# Include the RPC utilities from the `libc` project.
+include(FindLibcCommonUtils)
+
 # List of files that are buildable for all devices.
 set(supported_sources
   ${FLANG_SOURCE_DIR}/lib/Decimal/binary-to-decimal.cpp
@@ -90,6 +93,9 @@ set(host_sources
   time-intrinsic.cpp
   unit-map.cpp
 )
+if (TARGET llvm-libc-common-utilities)
+  list(APPEND host_sources io-api-server.cpp)
+endif()
 
 # Sources that can be compiled directly for the GPU.
 set(gpu_sources
@@ -137,6 +143,7 @@ set(gpu_sources
   reduce.cpp
   reduction.cpp
   temporary-stack.cpp
+  io-api-gpu.cpp
 )
 
 file(GLOB_RECURSE public_headers
diff --git a/flang-rt/lib/runtime/io-api-gpu.cpp b/flang-rt/lib/runtime/io-api-gpu.cpp
new file mode 100644
index 0000000000000..ce79b145251c1
--- /dev/null
+++ b/flang-rt/lib/runtime/io-api-gpu.cpp
@@ -0,0 +1,99 @@
+//===-- lib/runtime/io-api-gpu.cpp ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Implements the subset of the I/O statement API needed for basic
+// list-directed output (PRINT *) of intrinsic types for the GPU.
+//
+// The RPC interface forwards each runtime call from the client to the server
+// using a shared buffer. These calls are buffered on the server, so only the
+// return values from 'BeginExternalListOutput' and 'EndIoStatement' are
+// meaningful.
+
+#include "io-api-gpu.h"
+#include "flang/Runtime/io-api.h"
+
+#include <shared/rpc.h>
+#include <shared/rpc_dispatch.h>
+
+namespace Fortran::runtime::io {
+// A weak reference to the RPC client used to submit calls to the server.
+[[gnu::weak, gnu::visibility("protected")]] rpc::Client client asm(
+    "__llvm_rpc_client");
+
+RT_EXT_API_GROUP_BEGIN
+
+Cookie IODEF(BeginExternalListOutput)(
+    ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
+  return rpc::dispatch<BeginExternalListOutput_Opcode>(client,
+      IONAME(BeginExternalListOutput), unitNumber, sourceFile, sourceLine);
+}
+
+enum Iostat IODEF(EndIoStatement)(Cookie cookie) {
+  return rpc::dispatch<EndIoStatement_Opcode>(
+      client, IONAME(EndIoStatement), cookie);
+}
+
+bool IODEF(OutputInteger8)(Cookie cookie, std::int8_t n) {
+  return rpc::dispatch<OutputInteger8_Opcode>(
+      client, IONAME(OutputInteger8), cookie, n);
+}
+
+bool IODEF(OutputInteger16)(Cookie cookie, std::int16_t n) {
+  return rpc::dispatch<OutputInteger16_Opcode>(
+      client, IONAME(OutputInteger16), cookie, n);
+}
+
+bool IODEF(OutputInteger32)(Cookie cookie, std::int32_t n) {
+  return rpc::dispatch<OutputInteger32_Opcode>(
+      client, IONAME(OutputInteger32), cookie, n);
+}
+
+bool IODEF(OutputInteger64)(Cookie cookie, std::int64_t n) {
+  return rpc::dispatch<OutputInteger64_Opcode>(
+      client, IONAME(OutputInteger64), cookie, n);
+}
+
+#ifdef __SIZEOF_INT128__
+bool IODEF(OutputInteger128)(Cookie cookie, common::int128_t n) {
+  return rpc::dispatch<OutputInteger128_Opcode>(
+      client, IONAME(OutputInteger128), cookie, n);
+}
+#endif
+
+bool IODEF(OutputReal32)(Cookie cookie, float x) {
+  return rpc::dispatch<OutputReal32_Opcode>(
+      client, IONAME(OutputReal32), cookie, x);
+}
+
+bool IODEF(OutputReal64)(Cookie cookie, double x) {
+  return rpc::dispatch<OutputReal64_Opcode>(
+      client, IONAME(OutputReal64), cookie, x);
+}
+
+bool IODEF(OutputComplex32)(Cookie cookie, float re, float im) {
+  return rpc::dispatch<OutputComplex32_Opcode>(
+      client, IONAME(OutputComplex32), cookie, re, im);
+}
+
+bool IODEF(OutputComplex64)(Cookie cookie, double re, double im) {
+  return rpc::dispatch<OutputComplex64_Opcode>(
+      client, IONAME(OutputComplex64), cookie, re, im);
+}
+
+bool IODEF(OutputAscii)(Cookie cookie, const char *x, std::size_t length) {
+  return rpc::dispatch<OutputAscii_Opcode>(
+      client, IONAME(OutputAscii), cookie, x, length);
+}
+
+bool IODEF(OutputLogical)(Cookie cookie, bool truth) {
+  return rpc::dispatch<OutputLogical_Opcode>(
+      client, IONAME(OutputLogical), cookie, truth);
+}
+
+RT_EXT_API_GROUP_END
+} // namespace Fortran::runtime::io
diff --git a/flang-rt/lib/runtime/io-api-gpu.h b/flang-rt/lib/runtime/io-api-gpu.h
new file mode 100644
index 0000000000000..c2b55f40daf36
--- /dev/null
+++ b/flang-rt/lib/runtime/io-api-gpu.h
@@ -0,0 +1,82 @@
+//===-- lib/runtime/io-api-gpu.h --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FLANG_RT_RUNTIME_IO_API_GPU_H_
+#define FLANG_RT_RUNTIME_IO_API_GPU_H_
+
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/terminator.h"
+#include <cstdint>
+#include <utility>
+
+namespace Fortran::runtime::io {
+// We reserve the RPC opcodes with 'f' in the most significant byte for Fortran.
+constexpr std::uint32_t MakeOpcode(std::uint32_t base) {
+  return ('f' << 24) | base;
+}
+
+// Opcodes shared between the client and server for each function we support.
+constexpr std::uint32_t BeginExternalListOutput_Opcode = MakeOpcode(0);
+constexpr std::uint32_t EndIoStatement_Opcode = MakeOpcode(1);
+constexpr std::uint32_t OutputInteger8_Opcode = MakeOpcode(2);
+constexpr std::uint32_t OutputInteger16_Opcode = MakeOpcode(3);
+constexpr std::uint32_t OutputInteger32_Opcode = MakeOpcode(4);
+constexpr std::uint32_t OutputInteger64_Opcode = MakeOpcode(5);
+constexpr std::uint32_t OutputInteger128_Opcode = MakeOpcode(6);
+constexpr std::uint32_t OutputReal32_Opcode = MakeOpcode(7);
+constexpr std::uint32_t OutputReal64_Opcode = MakeOpcode(8);
+constexpr std::uint32_t OutputComplex32_Opcode = MakeOpcode(9);
+constexpr std::uint32_t OutputComplex64_Opcode = MakeOpcode(10);
+constexpr std::uint32_t OutputAscii_Opcode = MakeOpcode(11);
+constexpr std::uint32_t OutputLogical_Opcode = MakeOpcode(12);
+
+// A simple append-only dynamic array, used to avoid std::vector.
+template <typename T> struct DynamicArray {
+  ~DynamicArray() {
+    for (std::size_t i = 0; i < size_; ++i) {
+      data_[i].~T();
+    }
+    FreeMemory(data_);
+  }
+
+  void emplace_back(T &&value) {
+    if (size_ == capacity_) {
+      reserve(capacity_ ? capacity_ * 2 : 4);
+    }
+    new (data_ + size_) T(std::move(value));
+    ++size_;
+  }
+
+  void reserve(std::size_t newCap) {
+    if (newCap <= capacity_) {
+      return;
+    }
+    T *new_data = static_cast<T *>(
+        AllocateMemoryOrCrash(terminator_, newCap * sizeof(T)));
+    for (std::size_t i = 0; i < size_; ++i) {
+      new (new_data + i) T(std::move(data_[i]));
+      data_[i].~T();
+    }
+    FreeMemory(data_);
+    data_ = new_data;
+    capacity_ = newCap;
+  }
+
+  T *begin() const { return data_; }
+  T *end() const { return data_ + size_; }
+
+private:
+  T *data_ = nullptr;
+  std::size_t size_ = 0;
+  std::size_t capacity_ = 0;
+  Terminator terminator_{__FILE__, __LINE__};
+};
+
+} // namespace Fortran::runtime::io
+
+#endif // FLANG_RT_RUNTIME_IO_API_GPU_H_
diff --git a/flang-rt/lib/runtime/io-api-server.cpp b/flang-rt/lib/runtime/io-api-server.cpp
new file mode 100644
index 0000000000000..451bcae4380f1
--- /dev/null
+++ b/flang-rt/lib/runtime/io-api-server.cpp
@@ -0,0 +1,267 @@
+//===-- lib/runtime/io-api-server.cpp ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Implements the RPC server-side handling of the I/O statement API needed for
+// basic list-directed output (PRINT *) of intrinsic types for the GPU.
+
+#include "io-api-gpu.h"
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang/Runtime/io-api.h"
+#include <cstdlib>
+#include <cstring>
+#include <tuple>
+
+#include <shared/rpc.h>
+#include <shared/rpc_dispatch.h>
+
+namespace Fortran::runtime::io {
+namespace {
+
+// Context used to chain the IO operations once run.
+struct IOContext {
+  Cookie cookie = nullptr;
+  enum Iostat result = IostatOk;
+};
+
+// The base class to store deferred execution of a function. Uses function
+// pointers for type erasure to avoid virtual dispatch.
+struct DeferredFunctionBase {
+  using ExecuteFn = void (*)(void *, IOContext &);
+  using DestroyFn = void (*)(void *);
+
+  DeferredFunctionBase(void *impl, ExecuteFn exec, DestroyFn dtor)
+      : impl_(impl), execute_(exec), destroy_(dtor) {}
+
+  DeferredFunctionBase(const DeferredFunctionBase &) = delete;
+  DeferredFunctionBase &operator=(const DeferredFunctionBase &) = delete;
+  DeferredFunctionBase(DeferredFunctionBase &&other)
+      : impl_(other.impl_), execute_(other.execute_), destroy_(other.destroy_) {
+    other.impl_ = nullptr;
+  }
+  DeferredFunctionBase &operator=(DeferredFunctionBase &&other) {
+    if (this != &other) {
+      reset();
+      impl_ = other.impl_;
+      execute_ = other.execute_;
+      destroy_ = other.destroy_;
+      other.impl_ = nullptr;
+    }
+    return *this;
+  }
+
+  ~DeferredFunctionBase() { reset(); }
+
+  void execute(IOContext &ctx) { execute_(impl_, ctx); }
+
+  static OwningPtr<char[]> TempString(const char *str) {
+    if (!str) {
+      return {};
+    }
+
+    const auto size = std::strlen(str) + 1;
+    OwningPtr<char> temp = SizedNew<char>{Terminator{__FILE__, __LINE__}}(size);
+    std::memcpy(temp.get(), str, size);
+    return OwningPtr<char[]>(temp.release());
+  }
+
+private:
+  void reset() {
+    if (impl_) {
+      destroy_(impl_);
+      FreeMemory(impl_);
+      impl_ = nullptr;
+    }
+  }
+
+  void *impl_ = nullptr;
+  ExecuteFn execute_ = nullptr;
+  DestroyFn destroy_ = nullptr;
+};
+
+// Fortran does not support nested or recursive I/O, which is problematic for
+// parallel execution on a GPU. To support this, we defer execution of runtime
+// functions coming from the GPU's client until the end of that sequence is
+// reached. This allows us to finish them in a single pass.
+template <typename FnTy, typename... Args> struct DeferredFunction {
+  FnTy fn_;
+  std::tuple<std::decay_t<Args>...> args_;
+
+  DeferredFunction(FnTy &&fn, Args &&...args)
+      : fn_(std::forward<FnTy>(fn)), args_(std::forward<Args>(args)...) {}
+
+  // When executing the final command queue we need to replace the temporary
+  // values obtained from the GPU with the returned values from the actual
+  // runtime functions.
+  void execute(IOContext &ctx) {
+    auto caller = [&](auto &&...args) { return fn_(Rewrite(args, ctx)...); };
+
+    using RetTy = std::invoke_result_t<FnTy,
+        decltype(Rewrite(std::declval<Args &>(), ctx))...>;
+    if constexpr (std::is_same_v<RetTy, Cookie>) {
+      ctx.cookie = std::apply(caller, args_);
+    } else if constexpr (std::is_same_v<RetTy, Iostat>) {
+      ctx.result = std::apply(caller, args_);
+    } else {
+      std::apply(caller, args_);
+    }
+  }
+
+private:
+  template <typename T> T &Rewrite(T &v, IOContext &) { return v; }
+
+  const char *Rewrite(OwningPtr<char[]> &p, IOContext &) { return p.get(); }
+
+  Cookie Rewrite(Cookie, IOContext &ctx) { return ctx.cookie; }
+};
+
+template <typename Fn, typename... Args>
+DeferredFunctionBase MakeDeferred(Fn &&fn, Args &&...args) {
+  Terminator terminator{__FILE__, __LINE__};
+  using Ty = DeferredFunction<Fn, Args...>;
+  auto ptr = SizedNew<Ty>{terminator}(
+      sizeof(Ty), std::forward<Fn>(fn), std::forward<Args>(args)...);
+  void *raw = ptr.release();
+  return DeferredFunctionBase(
+      raw,
+      [](void *self, IOContext &ctx) { static_cast<Ty *>(self)->execute(ctx); },
+      [](void *self) { static_cast<Ty *>(self)->~Ty(); });
+}
+
+// The context associated with the queue of deferred functions. This serves as
+// our cookie object while executing this on the GPU.
+struct DeferredContext {
+  IOContext ioCtx;
+  DynamicArray<DeferredFunctionBase> commands;
+};
+
+template <typename FnTy, typename... Args>
+bool EnqueueDeferred(FnTy &&fn, Cookie cookie, Args &&...args) {
+  DeferredContext *ctx = reinterpret_cast<DeferredContext *>(cookie);
+  ctx->commands.emplace_back(
+      MakeDeferred(fn, cookie, std::forward<Args>(args)...));
+  return true;
+}
+
+template <std::uint32_t NumLanes>
+rpc::Status HandleOpcodesImpl(rpc::Server::Port &port) {
+  switch (port.get_opcode()) {
+  case BeginExternalListOutput_Opcode:
+    rpc::invoke<NumLanes>(port,
+        [](ExternalUnit unitNumber, const char *sourceFile,
+            int sourceLine) -> Cookie {
+          DeferredContext *ctx = new (AllocateMemoryOrCrash(
+              Terminator{__FILE__, __LINE__}, sizeof(DeferredContext)))
+              DeferredContext;
+
+          ctx->commands.emplace_back(
+              MakeDeferred(IONAME(BeginExternalListOutput), unitNumber,
+                  DeferredFunctionBase::TempString(sourceFile), sourceLine));
+
+          return reinterpret_cast<Cookie>(ctx);
+        });
+    break;
+  case EndIoStatement_Opcode:
+    rpc::invoke<NumLanes>(port, [](Cookie cookie) -> Iostat {
+      DeferredContext *ctx = reinterpret_cast<DeferredContext *>(cookie);
+
+      ctx->commands.emplace_back(MakeDeferred(IONAME(EndIoStatement), cookie));
+      for (auto &fn : ctx->commands)
+        fn.execute(ctx->ioCtx);
+      Iostat result = ctx->ioCtx.result;
+
+      ctx->~DeferredContext();
+      FreeMemory(ctx);
+
+      return result;
+    });
+    break;
+  case OutputAscii_Opcode:
+    rpc::invoke<NumLanes>(
+        port, [](Cookie cookie, const char *x, std::size_t length) -> bool {
+          return EnqueueDeferred(IONAME(OutputAscii), cookie,
+              DeferredFunctionBase::TempString(x), length);
+        });
+    break;
+  case OutputInteger8_Opcode:
+    rpc::invoke<NumLanes>(port, [](Cookie cookie, std::int8_t n) -> bool {
+      return EnqueueDeferred(IONAME(OutputInteger8), cookie, n);
+    });
+    break;
+  case OutputInteger16_Opcode:
+    rpc::invoke<NumLanes>(port, [](Cookie cookie, std::int16_t n) -> bool {
+      return EnqueueDeferred(IONAME(OutputInteger16), cookie, n);
+    });
+    break;
+  case OutputInteger32_Opcode:
+    rpc::invoke<NumLanes>(port, [](Cookie cookie, std::int32_t n) -> bool {
+      return EnqueueDeferred(IONAME(OutputInteger32), cookie, n);
+    });
+    break;
+  case OutputInteger64_Opcode:
+    rpc::invoke<NumLanes>(port, [](Cookie cookie, std::int64_t n) -> bool {
+      return EnqueueDeferred(IONAME(OutputInteger64), cookie, n);
+    });
+    break;
+#ifdef __SIZEOF_INT128__
+  case OutputInteger128_Opcode:
+    rpc::invoke<NumLanes>(port, [](Cookie cookie, common::int128_t n) -> bool {
+      return EnqueueDeferred(IONAME(OutputInteger128), cookie, n);
+    });
+    break;
+#endif
+  case OutputReal32_Opcode:
+    rpc::invoke<NumLanes>(port, [](Cookie cookie, float x) -> bool {
+      return EnqueueDeferred(IONAME(OutputReal32), cookie, x);
+    });
+    break;
+  case OutputReal64_Opcode:
+    rpc::invoke<NumLanes>(port, [](Cookie cookie, double x) -> bool {
+      return EnqueueDeferred(IONAME(OutputReal64), cookie, x);
+    });
+    break;
+  case OutputComplex32_Opcode:
+    rpc::invoke<NumLanes>(port, [](Cookie cookie, float re, float im) -> bool {
+      return EnqueueDeferred(IONAME(OutputComplex32), cookie, re, im);
+    });
+    break;
+  case OutputComplex64_Opcode:
+    rpc::invoke<NumLanes>(
+        port, [](Cookie cookie, double re, double im) -> bool {
+          return EnqueueDeferred(IONAME(OutputComplex64), cookie, re, im);
+        });
+    break;
+  case OutputLogical_Opcode:
+    rpc::invoke<NumLanes>(port, [](Cookie cookie, bool truth) -> bool {
+      return EnqueueDeferred(IONAME(OutputLogical), cookie, truth);
+    });
+    break;
+  default:
+    return rpc::RPC_UNHANDLED_OPCODE;
+  }
+
+  return rpc::RPC_SUCCESS;
+}
+} // namespace
+
+RT_EXT_API_GROUP_BEGIN
+std::uint32_t IODEF(HandleRPCOpcodes)(void *raw, std::uint32_t numLanes) {
+  rpc::Server::Port &Port = *reinterpret_cast<rpc::Server::Port *>(raw);
+  if (numLanes == 1) {
+    return HandleOpcodesImpl<1>(Port);
+  }
+  if (numLanes == 32) {
+    return HandleOpcodesImpl<32>(Port);
+  }
+  if (numLanes == 64) {
+    return HandleOpcodesImpl<64>(Port);
+  }
+  return rpc::RPC_ERROR;
+}
+RT_EXT_API_GROUP_END
+} // namespace Fortran::runtime::io
diff --git a/flang/include/flang/Runtime/io-api.h b/flang/include/flang/Runtime/io-api.h
index 988fe536705e6..fe49af2f61683 100644
--- a/flang/include/flang/Runtime/io-api.h
+++ b/flang/include/flang/Runtime/io-api.h
@@ -364,6 +364,8 @@ bool IODECL(InquireInteger64)(
 // rather than by terminating the image.
 enum Iostat IODECL(EndIoStatement)(Cookie);
 
+// Used for I/O from the offloading device.
+std::uint32_t IODECL(HandleRPCOpcodes)(void *raw, std::uint32_t numLanes);
 } // extern "C"
 } // namespace Fortran::runtime::io
 
diff --git a/offload/plugins-nextgen/common/CMakeLists.txt b/offload/plugins-nextgen/common/CMakeLists.txt
index 23000783270f8..d02366ee72c26 100644
--- a/offload/plugins-nextgen/common/CMakeLists.txt
+++ b/offload/plugins-nextgen/common/CMakeLists.txt
@@ -35,6 +35,10 @@ endif()
 include(FindLibcCommonUtils)
 target_link_libraries(PluginCommon PRIVATE llvm-libc-common-utilities)
 
+if (TARGET flang_rt.runtime.static)
+  target_link_libraries(PluginCommon PRIVATE flang_rt.runtime.static)
+endif()
+
 # Define the TARGET_NAME and DEBUG_PREFIX.
 target_compile_definitions(PluginCommon PRIVATE
   TARGET_NAME=PluginInterface
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index 8bb60feab7b8d..be0e5babbe276 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -17,6 +17,10 @@
 #include "shared/rpc_opcodes.h"
 #include "shared/rpc_server.h"
 
+#if __has_include("flang/Runtime/io-api.h")
+#include "flang/Runtime/io-api.h"
+#endif
+
 using namespace llvm;
 using namespace omp;
 using namespace target;
@@ -111,6 +115,12 @@ runServer(plugin::GenericDeviceTy &Device, void *Buffer,
   if (Status == rpc::RPC_UNHANDLED_OPCODE)
     Status = LIBC_NAMESPACE::shared::handle_libc_opcodes(*Port, NumLanes);
 
+#if __has_include("flang/Runtime/io-api.h")
+  if (Status == rpc::RPC_UNHANDLED_OPCODE)
+    Status = static_cast<rpc::Status>(
+        Fortran::runtime::io::IONAME(HandleRPCOpcodes)(&*Port, NumLanes));
+#endif
+
   return Status;
 }
 
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index 0d5a9c95c1d95..0e0d9abd3d8ff 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -166,8 +166,6 @@ elif config.operating_system == 'Darwin':
     config.test_flags += " -Wl,-rpath," + config.library_dir
     config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory
 else: # Unices
-    if config.libomptarget_current_target != "nvptx64-nvidia-cuda":
-        config.test_flags += " -nogpulib"
     config.test_flags += " -Wl,-rpath," + config.library_dir
     config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory
     config.test_flags += " -Wl,-rpath," + config.llvm_library_intdir
diff --git a/offload/test/offloading/fortran/io.f90 b/offload/test/offloading/fortran/io.f90
new file mode 100644
index 0000000000000..67bf5e915a20a
--- /dev/null
+++ b/offload/test/offloading/fortran/io.f90
@@ -0,0 +1,27 @@
+! REQUIRES: flang, libc
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+
+program io_test
+  implicit none
+
+  integer :: i
+  real :: r
+  complex :: c
+  logical :: l
+
+  i = 42
+  r = 3.14
+  c = (1.0, -1.0)
+  l = .true.
+
+  ! CHECK: Text 42 3.14 (1.,-1.) T
+  ! CHECK: Text 42 3.14 (1.,-1.) T
+  ! CHECK: Text 42 3.14 (1.,-1.) T
+  ! CHECK: Text 42 3.14 (1.,-1.) T
+  !$omp target teams num_teams(4)
+  !$omp parallel num_threads(1)
+    print *, "Text", " ", i, " ", r, " ", c, " ", l
+  !$omp end parallel
+  !$omp end target teams
+
+end program io_test
diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt
index 33f6da32fe6ff..aae5bc78a12f7 100644
--- a/runtimes/CMakeLists.txt
+++ b/runtimes/CMakeLists.txt
@@ -36,7 +36,7 @@ list(INSERT CMAKE_MODULE_PATH 0
 # We order libraries to mirror roughly how they are layered, except that compiler-rt can depend
 # on libc++, so we put it after.
 set(LLVM_DEFAULT_RUNTIMES "libc;libunwind;libcxxabi;libcxx;compiler-rt;libclc;openmp;offload")
-set(LLVM_SUPPORTED_RUNTIMES "${LLVM_DEFAULT_RUNTIMES};llvm-libgcc;flang-rt;libsycl;orc-rt")
+set(LLVM_SUPPORTED_RUNTIMES "libc;libunwind;libcxxabi;libcxx;compiler-rt;libclc;openmp;flang-rt;offload;llvm-libgcc;libsycl;orc-rt")
 set(LLVM_ENABLE_RUNTIMES "" CACHE STRING
   "Semicolon-separated list of runtimes to build, or \"all\" (${LLVM_DEFAULT_RUNTIMES}). Supported runtimes are ${LLVM_SUPPORTED_RUNTIMES}.")
 if(LLVM_ENABLE_RUNTIMES STREQUAL "all" )