[flang-commits] [flang] [flang][runtime] Prepare enabling PRINT of integer32 for device. (PR #86247)

Slava Zakharin via flang-commits flang-commits at lists.llvm.org
Mon Mar 25 14:56:11 PDT 2024


https://github.com/vzakhari updated https://github.com/llvm/llvm-project/pull/86247

>From bbf241f9767b69348fc93ada58b4764c5b3b127e Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Thu, 21 Mar 2024 21:15:58 -0700
Subject: [PATCH 1/4] [flang][runtime] Prepare enabling PRINT of integer32 for
 device.

This commit adds required files into the offload build closure,
which means adding RT_API_ATTRS and other markers.

The implementation does not work for CUDA yet, because of
std::variant,swap,reverse usage. These issues will be resolved
separately (e.g. by using libcudacxx header files).
---
 flang/include/flang/Common/real.h             |   3 +
 flang/include/flang/Common/restorer.h         |  11 +-
 flang/include/flang/Common/uint128.h          |   3 +
 flang/include/flang/Common/visit.h            |   5 +-
 .../flang/Decimal/binary-floating-point.h     |  51 ++-
 flang/include/flang/Decimal/decimal.h         |  53 +--
 flang/include/flang/Runtime/api-attrs.h       |   2 +-
 flang/include/flang/Runtime/io-api.h          |  16 +-
 flang/include/flang/Runtime/iostat.h          |   3 +-
 flang/include/flang/Runtime/memory.h          |  13 +-
 flang/include/flang/Runtime/type-code.h       |   1 -
 flang/runtime/CMakeLists.txt                  |  19 +
 flang/runtime/buffer.cpp                      |   6 +
 flang/runtime/buffer.h                        |  42 +-
 flang/runtime/connection.cpp                  |  15 +-
 flang/runtime/connection.h                    |  29 +-
 flang/runtime/descriptor-io.cpp               |   7 +-
 flang/runtime/descriptor-io.h                 |  36 +-
 flang/runtime/edit-input.cpp                  |  90 ++--
 flang/runtime/edit-input.h                    |  28 +-
 flang/runtime/edit-output.cpp                 |  80 ++--
 flang/runtime/edit-output.h                   |  76 ++--
 flang/runtime/emit-encoded.h                  |   7 +-
 flang/runtime/environment.h                   |   2 +
 flang/runtime/external-unit.cpp               |  11 +-
 flang/runtime/file.cpp                        |  22 +
 flang/runtime/file.h                          |  12 +-
 flang/runtime/format-implementation.h         |  16 +-
 flang/runtime/format.cpp                      |   2 +
 flang/runtime/format.h                        |  39 +-
 flang/runtime/freestanding-tools.h            |  25 +-
 flang/runtime/internal-unit.cpp               |  31 +-
 flang/runtime/internal-unit.h                 |  26 +-
 flang/runtime/io-api.cpp                      |  26 +-
 flang/runtime/io-error.cpp                    |  27 +-
 flang/runtime/io-error.h                      |  36 +-
 flang/runtime/io-stmt.cpp                     |  26 +-
 flang/runtime/io-stmt.h                       | 410 ++++++++++--------
 flang/runtime/iostat.cpp                      |   4 +
 flang/runtime/lock.h                          |  14 +-
 flang/runtime/memory.cpp                      |  12 +-
 flang/runtime/namelist.cpp                    |  15 +-
 flang/runtime/namelist.h                      |   3 +-
 flang/runtime/non-tbp-dio.h                   |   2 +-
 flang/runtime/numeric-templates.h             |   7 -
 flang/runtime/pointer.cpp                     |   2 -
 flang/runtime/pseudo-unit.cpp                 |   8 +-
 flang/runtime/terminator.h                    |   2 +-
 flang/runtime/unit.cpp                        |  25 +-
 flang/runtime/unit.h                          | 203 +++++----
 flang/runtime/utf.cpp                         |   6 +-
 flang/runtime/utf.h                           |  14 +-
 52 files changed, 965 insertions(+), 659 deletions(-)

diff --git a/flang/include/flang/Common/real.h b/flang/include/flang/Common/real.h
index 50aab7d89a597e..9ca58bed2dd7c2 100644
--- a/flang/include/flang/Common/real.h
+++ b/flang/include/flang/Common/real.h
@@ -13,6 +13,7 @@
 // The various representations are distinguished by their binary precisions
 // (number of explicit significand bits and any implicit MSB in the fraction).
 
+#include "flang/Runtime/api-attrs.h"
 #include <cinttypes>
 
 namespace Fortran::common {
@@ -119,6 +120,7 @@ template <int BINARY_PRECISION> class RealDetails {
   }
 
 public:
+  RT_OFFLOAD_VAR_GROUP_BEGIN
   static constexpr int binaryPrecision{BINARY_PRECISION};
   static constexpr int bits{BitsForBinaryPrecision(binaryPrecision)};
   static constexpr bool isImplicitMSB{binaryPrecision != 64 /*x87*/};
@@ -138,6 +140,7 @@ template <int BINARY_PRECISION> class RealDetails {
 
   static constexpr int maxHexadecimalConversionDigits{
       MaxHexadecimalConversionDigits(binaryPrecision)};
+  RT_OFFLOAD_VAR_GROUP_END
 
   static_assert(binaryPrecision > 0);
   static_assert(exponentBits > 1);
diff --git a/flang/include/flang/Common/restorer.h b/flang/include/flang/Common/restorer.h
index 4d5f5e4e2c818d..36bf11d09bbb99 100644
--- a/flang/include/flang/Common/restorer.h
+++ b/flang/include/flang/Common/restorer.h
@@ -19,11 +19,13 @@
 #ifndef FORTRAN_COMMON_RESTORER_H_
 #define FORTRAN_COMMON_RESTORER_H_
 #include "idioms.h"
+#include "flang/Runtime/api-attrs.h"
 namespace Fortran::common {
 template <typename A> class Restorer {
 public:
-  explicit Restorer(A &p, A original) : p_{p}, original_{std::move(original)} {}
-  ~Restorer() { p_ = std::move(original_); }
+  explicit RT_API_ATTRS Restorer(A &p, A original)
+      : p_{p}, original_{std::move(original)} {}
+  RT_API_ATTRS ~Restorer() { p_ = std::move(original_); }
 
   // Inhibit any recreation of this restorer that would result in two restorers
   // trying to restore the same reference.
@@ -38,13 +40,14 @@ template <typename A> class Restorer {
 };
 
 template <typename A, typename B>
-common::IfNoLvalue<Restorer<A>, B> ScopedSet(A &to, B &&from) {
+RT_API_ATTRS common::IfNoLvalue<Restorer<A>, B> ScopedSet(A &to, B &&from) {
   A original{std::move(to)};
   to = std::move(from);
   return Restorer<A>{to, std::move(original)};
 }
 template <typename A, typename B>
-common::IfNoLvalue<Restorer<A>, B> ScopedSet(A &to, const B &from) {
+RT_API_ATTRS common::IfNoLvalue<Restorer<A>, B> ScopedSet(
+    A &to, const B &from) {
   A original{std::move(to)};
   to = from;
   return Restorer<A>{to, std::move(original)};
diff --git a/flang/include/flang/Common/uint128.h b/flang/include/flang/Common/uint128.h
index 03e44eb6997d5b..55841c0d9b9028 100644
--- a/flang/include/flang/Common/uint128.h
+++ b/flang/include/flang/Common/uint128.h
@@ -20,6 +20,7 @@
 #endif
 
 #include "leading-zero-bit-count.h"
+#include "flang/Runtime/api-attrs.h"
 #include <cstdint>
 #include <type_traits>
 
@@ -260,7 +261,9 @@ template <bool IS_SIGNED = false> class Int128 {
       return LeadingZeroBitCount(high_);
     }
   }
+  RT_VAR_GROUP_BEGIN
   static constexpr std::uint64_t topBit{std::uint64_t{1} << 63};
+  RT_VAR_GROUP_END
 #if FLANG_LITTLE_ENDIAN
   std::uint64_t low_{0}, high_{0};
 #elif FLANG_BIG_ENDIAN
diff --git a/flang/include/flang/Common/visit.h b/flang/include/flang/Common/visit.h
index f733b726189c88..9d9048c8f4bf10 100644
--- a/flang/include/flang/Common/visit.h
+++ b/flang/include/flang/Common/visit.h
@@ -21,6 +21,7 @@
 #ifndef FORTRAN_COMMON_VISIT_H_
 #define FORTRAN_COMMON_VISIT_H_
 
+#include "flang/Runtime/api-attrs.h"
 #include <type_traits>
 #include <variant>
 
@@ -29,7 +30,7 @@ namespace log2visit {
 
 template <std::size_t LOW, std::size_t HIGH, typename RESULT, typename VISITOR,
     typename... VARIANT>
-inline RESULT Log2VisitHelper(
+inline RT_API_ATTRS RESULT Log2VisitHelper(
     VISITOR &&visitor, std::size_t which, VARIANT &&...u) {
   if constexpr (LOW + 7 >= HIGH) {
     switch (which - LOW) {
@@ -61,7 +62,7 @@ inline RESULT Log2VisitHelper(
 }
 
 template <typename VISITOR, typename... VARIANT>
-inline auto visit(VISITOR &&visitor, VARIANT &&...u)
+inline RT_API_ATTRS auto visit(VISITOR &&visitor, VARIANT &&...u)
     -> decltype(visitor(std::get<0>(std::forward<VARIANT>(u))...)) {
   using Result = decltype(visitor(std::get<0>(std::forward<VARIANT>(u))...));
   if constexpr (sizeof...(u) == 1) {
diff --git a/flang/include/flang/Decimal/binary-floating-point.h b/flang/include/flang/Decimal/binary-floating-point.h
index d1992819f85aa6..1c8829550043de 100644
--- a/flang/include/flang/Decimal/binary-floating-point.h
+++ b/flang/include/flang/Decimal/binary-floating-point.h
@@ -14,6 +14,7 @@
 
 #include "flang/Common/real.h"
 #include "flang/Common/uint128.h"
+#include "flang/Runtime/api-attrs.h"
 #include <cinttypes>
 #include <climits>
 #include <cstring>
@@ -47,9 +48,11 @@ class BinaryFloatingPointNumber : public common::RealDetails<BINARY_PRECISION> {
 
   using RawType = common::HostUnsignedIntType<bits>;
   static_assert(CHAR_BIT * sizeof(RawType) >= bits);
+  RT_OFFLOAD_VAR_GROUP_BEGIN
   static constexpr RawType significandMask{(RawType{1} << significandBits) - 1};
 
-  constexpr BinaryFloatingPointNumber() {} // zero
+  constexpr RT_API_ATTRS BinaryFloatingPointNumber() {} // zero
+  RT_OFFLOAD_VAR_GROUP_END
   constexpr BinaryFloatingPointNumber(
       const BinaryFloatingPointNumber &that) = default;
   constexpr BinaryFloatingPointNumber(
@@ -58,26 +61,30 @@ class BinaryFloatingPointNumber : public common::RealDetails<BINARY_PRECISION> {
       const BinaryFloatingPointNumber &that) = default;
   constexpr BinaryFloatingPointNumber &operator=(
       BinaryFloatingPointNumber &&that) = default;
-  constexpr explicit BinaryFloatingPointNumber(RawType raw) : raw_{raw} {}
+  constexpr explicit RT_API_ATTRS BinaryFloatingPointNumber(RawType raw)
+      : raw_{raw} {}
 
-  RawType raw() const { return raw_; }
+  RT_API_ATTRS RawType raw() const { return raw_; }
 
-  template <typename A> explicit constexpr BinaryFloatingPointNumber(A x) {
+  template <typename A>
+  explicit constexpr RT_API_ATTRS BinaryFloatingPointNumber(A x) {
     static_assert(sizeof raw_ <= sizeof x);
     std::memcpy(reinterpret_cast<void *>(&raw_),
         reinterpret_cast<const void *>(&x), sizeof raw_);
   }
 
-  constexpr int BiasedExponent() const {
+  constexpr RT_API_ATTRS int BiasedExponent() const {
     return static_cast<int>(
         (raw_ >> significandBits) & ((1 << exponentBits) - 1));
   }
-  constexpr int UnbiasedExponent() const {
+  constexpr RT_API_ATTRS int UnbiasedExponent() const {
     int biased{BiasedExponent()};
     return biased - exponentBias + (biased == 0);
   }
-  constexpr RawType Significand() const { return raw_ & significandMask; }
-  constexpr RawType Fraction() const {
+  constexpr RT_API_ATTRS RawType Significand() const {
+    return raw_ & significandMask;
+  }
+  constexpr RT_API_ATTRS RawType Fraction() const {
     RawType sig{Significand()};
     if (isImplicitMSB && BiasedExponent() > 0) {
       sig |= RawType{1} << significandBits;
@@ -85,10 +92,10 @@ class BinaryFloatingPointNumber : public common::RealDetails<BINARY_PRECISION> {
     return sig;
   }
 
-  constexpr bool IsZero() const {
+  constexpr RT_API_ATTRS bool IsZero() const {
     return (raw_ & ((RawType{1} << (bits - 1)) - 1)) == 0;
   }
-  constexpr bool IsNaN() const {
+  constexpr RT_API_ATTRS bool IsNaN() const {
     auto expo{BiasedExponent()};
     auto sig{Significand()};
     if constexpr (bits == 80) { // x87
@@ -102,7 +109,7 @@ class BinaryFloatingPointNumber : public common::RealDetails<BINARY_PRECISION> {
       return expo == maxExponent && sig != 0;
     }
   }
-  constexpr bool IsInfinite() const {
+  constexpr RT_API_ATTRS bool IsInfinite() const {
     if constexpr (bits == 80) { // x87
       return BiasedExponent() == maxExponent &&
           Significand() == ((significandMask >> 1) + 1);
@@ -110,27 +117,30 @@ class BinaryFloatingPointNumber : public common::RealDetails<BINARY_PRECISION> {
       return BiasedExponent() == maxExponent && Significand() == 0;
     }
   }
-  constexpr bool IsMaximalFiniteMagnitude() const {
+  constexpr RT_API_ATTRS bool IsMaximalFiniteMagnitude() const {
     return BiasedExponent() == maxExponent - 1 &&
         Significand() == significandMask;
   }
-  constexpr bool IsNegative() const { return ((raw_ >> (bits - 1)) & 1) != 0; }
+  constexpr RT_API_ATTRS bool IsNegative() const {
+    return ((raw_ >> (bits - 1)) & 1) != 0;
+  }
 
-  constexpr void Negate() { raw_ ^= RawType{1} << (bits - 1); }
+  constexpr RT_API_ATTRS void Negate() { raw_ ^= RawType{1} << (bits - 1); }
 
   // For calculating the nearest neighbors of a floating-point value
-  constexpr void Previous() {
+  constexpr RT_API_ATTRS void Previous() {
     RemoveExplicitMSB();
     --raw_;
     InsertExplicitMSB();
   }
-  constexpr void Next() {
+  constexpr RT_API_ATTRS void Next() {
     RemoveExplicitMSB();
     ++raw_;
     InsertExplicitMSB();
   }
 
-  static constexpr BinaryFloatingPointNumber Infinity(bool isNegative) {
+  static constexpr RT_API_ATTRS BinaryFloatingPointNumber Infinity(
+      bool isNegative) {
     RawType result{RawType{maxExponent} << significandBits};
     if (isNegative) {
       result |= RawType{1} << (bits - 1);
@@ -139,7 +149,8 @@ class BinaryFloatingPointNumber : public common::RealDetails<BINARY_PRECISION> {
   }
 
   // Returns true when the result is exact
-  constexpr bool RoundToBits(int keepBits, enum FortranRounding mode) {
+  constexpr RT_API_ATTRS bool RoundToBits(
+      int keepBits, enum FortranRounding mode) {
     if (IsNaN() || IsInfinite() || keepBits >= binaryPrecision) {
       return true;
     }
@@ -180,12 +191,12 @@ class BinaryFloatingPointNumber : public common::RealDetails<BINARY_PRECISION> {
   }
 
 private:
-  constexpr void RemoveExplicitMSB() {
+  constexpr RT_API_ATTRS void RemoveExplicitMSB() {
     if constexpr (!isImplicitMSB) {
       raw_ = (raw_ & (significandMask >> 1)) | ((raw_ & ~significandMask) >> 1);
     }
   }
-  constexpr void InsertExplicitMSB() {
+  constexpr RT_API_ATTRS void InsertExplicitMSB() {
     if constexpr (!isImplicitMSB) {
       constexpr RawType mask{significandMask >> 1};
       raw_ = (raw_ & mask) | ((raw_ & ~mask) << 1);
diff --git a/flang/include/flang/Decimal/decimal.h b/flang/include/flang/Decimal/decimal.h
index f0997fb63df018..aeda01c44fa6f6 100644
--- a/flang/include/flang/Decimal/decimal.h
+++ b/flang/include/flang/Decimal/decimal.h
@@ -12,6 +12,7 @@
 #ifndef FORTRAN_DECIMAL_DECIMAL_H_
 #define FORTRAN_DECIMAL_DECIMAL_H_
 
+#include "flang/Runtime/api-attrs.h"
 #include <stddef.h>
 
 #ifdef __cplusplus
@@ -65,27 +66,27 @@ enum DecimalConversionFlags {
 
 #ifdef __cplusplus
 template <int PREC>
-ConversionToDecimalResult ConvertToDecimal(char *, size_t,
+RT_API_ATTRS ConversionToDecimalResult ConvertToDecimal(char *, size_t,
     DecimalConversionFlags, int digits, enum FortranRounding rounding,
     BinaryFloatingPointNumber<PREC> x);
 
-extern template ConversionToDecimalResult ConvertToDecimal<8>(char *, size_t,
-    enum DecimalConversionFlags, int, enum FortranRounding,
+extern template RT_API_ATTRS ConversionToDecimalResult ConvertToDecimal<8>(
+    char *, size_t, enum DecimalConversionFlags, int, enum FortranRounding,
     BinaryFloatingPointNumber<8>);
-extern template ConversionToDecimalResult ConvertToDecimal<11>(char *, size_t,
-    enum DecimalConversionFlags, int, enum FortranRounding,
+extern template RT_API_ATTRS ConversionToDecimalResult ConvertToDecimal<11>(
+    char *, size_t, enum DecimalConversionFlags, int, enum FortranRounding,
     BinaryFloatingPointNumber<11>);
-extern template ConversionToDecimalResult ConvertToDecimal<24>(char *, size_t,
-    enum DecimalConversionFlags, int, enum FortranRounding,
+extern template RT_API_ATTRS ConversionToDecimalResult ConvertToDecimal<24>(
+    char *, size_t, enum DecimalConversionFlags, int, enum FortranRounding,
     BinaryFloatingPointNumber<24>);
-extern template ConversionToDecimalResult ConvertToDecimal<53>(char *, size_t,
-    enum DecimalConversionFlags, int, enum FortranRounding,
+extern template RT_API_ATTRS ConversionToDecimalResult ConvertToDecimal<53>(
+    char *, size_t, enum DecimalConversionFlags, int, enum FortranRounding,
     BinaryFloatingPointNumber<53>);
-extern template ConversionToDecimalResult ConvertToDecimal<64>(char *, size_t,
-    enum DecimalConversionFlags, int, enum FortranRounding,
+extern template RT_API_ATTRS ConversionToDecimalResult ConvertToDecimal<64>(
+    char *, size_t, enum DecimalConversionFlags, int, enum FortranRounding,
     BinaryFloatingPointNumber<64>);
-extern template ConversionToDecimalResult ConvertToDecimal<113>(char *, size_t,
-    enum DecimalConversionFlags, int, enum FortranRounding,
+extern template RT_API_ATTRS ConversionToDecimalResult ConvertToDecimal<113>(
+    char *, size_t, enum DecimalConversionFlags, int, enum FortranRounding,
     BinaryFloatingPointNumber<113>);
 
 template <int PREC> struct ConversionToBinaryResult {
@@ -94,20 +95,20 @@ template <int PREC> struct ConversionToBinaryResult {
 };
 
 template <int PREC>
-ConversionToBinaryResult<PREC> ConvertToBinary(const char *&,
+RT_API_ATTRS ConversionToBinaryResult<PREC> ConvertToBinary(const char *&,
     enum FortranRounding = RoundNearest, const char *end = nullptr);
 
-extern template ConversionToBinaryResult<8> ConvertToBinary<8>(
+extern template RT_API_ATTRS ConversionToBinaryResult<8> ConvertToBinary<8>(
     const char *&, enum FortranRounding, const char *end);
-extern template ConversionToBinaryResult<11> ConvertToBinary<11>(
+extern template RT_API_ATTRS ConversionToBinaryResult<11> ConvertToBinary<11>(
     const char *&, enum FortranRounding, const char *end);
-extern template ConversionToBinaryResult<24> ConvertToBinary<24>(
+extern template RT_API_ATTRS ConversionToBinaryResult<24> ConvertToBinary<24>(
     const char *&, enum FortranRounding, const char *end);
-extern template ConversionToBinaryResult<53> ConvertToBinary<53>(
+extern template RT_API_ATTRS ConversionToBinaryResult<53> ConvertToBinary<53>(
     const char *&, enum FortranRounding, const char *end);
-extern template ConversionToBinaryResult<64> ConvertToBinary<64>(
+extern template RT_API_ATTRS ConversionToBinaryResult<64> ConvertToBinary<64>(
     const char *&, enum FortranRounding, const char *end);
-extern template ConversionToBinaryResult<113> ConvertToBinary<113>(
+extern template RT_API_ATTRS ConversionToBinaryResult<113> ConvertToBinary<113>(
     const char *&, enum FortranRounding, const char *end);
 } // namespace Fortran::decimal
 extern "C" {
@@ -116,21 +117,21 @@ extern "C" {
 #define NS(x) x
 #endif /* C++ */
 
-struct NS(ConversionToDecimalResult)
+RT_API_ATTRS struct NS(ConversionToDecimalResult)
     ConvertFloatToDecimal(char *, size_t, enum NS(DecimalConversionFlags),
         int digits, enum NS(FortranRounding), float);
-struct NS(ConversionToDecimalResult)
+RT_API_ATTRS struct NS(ConversionToDecimalResult)
     ConvertDoubleToDecimal(char *, size_t, enum NS(DecimalConversionFlags),
         int digits, enum NS(FortranRounding), double);
-struct NS(ConversionToDecimalResult)
+RT_API_ATTRS struct NS(ConversionToDecimalResult)
     ConvertLongDoubleToDecimal(char *, size_t, enum NS(DecimalConversionFlags),
         int digits, enum NS(FortranRounding), long double);
 
-enum NS(ConversionResultFlags)
+RT_API_ATTRS enum NS(ConversionResultFlags)
     ConvertDecimalToFloat(const char **, float *, enum NS(FortranRounding));
-enum NS(ConversionResultFlags)
+RT_API_ATTRS enum NS(ConversionResultFlags)
     ConvertDecimalToDouble(const char **, double *, enum NS(FortranRounding));
-enum NS(ConversionResultFlags) ConvertDecimalToLongDouble(
+RT_API_ATTRS enum NS(ConversionResultFlags) ConvertDecimalToLongDouble(
     const char **, long double *, enum NS(FortranRounding));
 #undef NS
 #ifdef __cplusplus
diff --git a/flang/include/flang/Runtime/api-attrs.h b/flang/include/flang/Runtime/api-attrs.h
index fc3eb42e1b73f5..050d2366b8e165 100644
--- a/flang/include/flang/Runtime/api-attrs.h
+++ b/flang/include/flang/Runtime/api-attrs.h
@@ -102,7 +102,7 @@
  * to appear as part of a C++ decl-specifier.
  */
 #ifndef RT_CONST_VAR_ATTRS
-#if defined(__CUDACC__) || defined(__CUDA__)
+#if (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
 #define RT_CONST_VAR_ATTRS __constant__
 #else
 #define RT_CONST_VAR_ATTRS
diff --git a/flang/include/flang/Runtime/io-api.h b/flang/include/flang/Runtime/io-api.h
index 556cc20c5a121e..1b6c4f5d6a65ca 100644
--- a/flang/include/flang/Runtime/io-api.h
+++ b/flang/include/flang/Runtime/io-api.h
@@ -51,13 +51,21 @@ constexpr InquiryKeywordHash HashInquiryKeyword(const char *p) {
   return hash;
 }
 
-const char *InquiryKeywordHashDecode(
+RT_API_ATTRS const char *InquiryKeywordHashDecode(
     char *buffer, std::size_t, InquiryKeywordHash);
 
 extern "C" {
 
 #define IONAME(name) RTNAME(io##name)
 
+#ifndef IODECL
+#define IODECL(name) RT_API_ATTRS IONAME(name)
+#endif
+
+#ifndef IODEF
+#define IODEF(name) RT_API_ATTRS IONAME(name)
+#endif
+
 // These functions initiate data transfer statements (READ, WRITE, PRINT).
 // Example: PRINT *, 666 is implemented as the series of calls:
 //   Cookie cookie{BeginExternalListOutput(DefaultOutputUnit,
@@ -139,7 +147,7 @@ enum Iostat IONAME(CheckUnitNumberInRange128)(common::int128_t unit,
     const char *sourceFile = nullptr, int sourceLine = 0);
 
 // External synchronous I/O initiation
-Cookie IONAME(BeginExternalListOutput)(ExternalUnit = DefaultOutputUnit,
+Cookie IODECL(BeginExternalListOutput)(ExternalUnit = DefaultOutputUnit,
     const char *sourceFile = nullptr, int sourceLine = 0);
 Cookie IONAME(BeginExternalListInput)(ExternalUnit = DefaultInputUnit,
     const char *sourceFile = nullptr, int sourceLine = 0);
@@ -253,7 +261,7 @@ bool IONAME(InputDescriptor)(Cookie, const Descriptor &);
 // Formatted (including list directed) I/O data items
 bool IONAME(OutputInteger8)(Cookie, std::int8_t);
 bool IONAME(OutputInteger16)(Cookie, std::int16_t);
-bool IONAME(OutputInteger32)(Cookie, std::int32_t);
+bool IODECL(OutputInteger32)(Cookie, std::int32_t);
 bool IONAME(OutputInteger64)(Cookie, std::int64_t);
 bool IONAME(OutputInteger128)(Cookie, common::int128_t);
 bool IONAME(InputInteger)(Cookie, std::int64_t &, int kind = 8);
@@ -357,7 +365,7 @@ bool IONAME(InquireInteger64)(
 // returned is guaranteed to only be one of the problems that the
 // EnableHandlers() call has indicated should be handled in compiled code
 // rather than by terminating the image.
-enum Iostat IONAME(EndIoStatement)(Cookie);
+enum Iostat IODECL(EndIoStatement)(Cookie);
 
 } // extern "C"
 } // namespace Fortran::runtime::io
diff --git a/flang/include/flang/Runtime/iostat.h b/flang/include/flang/Runtime/iostat.h
index afce509cf1f564..c3ec8cae858163 100644
--- a/flang/include/flang/Runtime/iostat.h
+++ b/flang/include/flang/Runtime/iostat.h
@@ -11,6 +11,7 @@
 
 #ifndef FORTRAN_RUNTIME_IOSTAT_H_
 #define FORTRAN_RUNTIME_IOSTAT_H_
+#include "flang/Runtime/api-attrs.h"
 #include "flang/Runtime/magic-numbers.h"
 namespace Fortran::runtime::io {
 
@@ -88,7 +89,7 @@ enum Iostat {
   IostatNonExternalDefinedUnformattedIo,
 };
 
-const char *IostatErrorString(int);
+RT_API_ATTRS const char *IostatErrorString(int);
 
 } // namespace Fortran::runtime::io
 #endif // FORTRAN_RUNTIME_IOSTAT_H_
diff --git a/flang/include/flang/Runtime/memory.h b/flang/include/flang/Runtime/memory.h
index e24c509f4e90cb..0f2e7c3904f580 100644
--- a/flang/include/flang/Runtime/memory.h
+++ b/flang/include/flang/Runtime/memory.h
@@ -79,6 +79,8 @@ template <typename A> class OwningPtr {
     return p;
   }
 
+  RT_DIAG_PUSH
+  RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
   // Replace the pointer.
   RT_API_ATTRS void reset(pointer_type p = pointer_type{}) {
     std::swap(ptr_, p);
@@ -90,6 +92,7 @@ template <typename A> class OwningPtr {
 
   // Exchange the pointer with another object.
   RT_API_ATTRS void swap(OwningPtr &other) { std::swap(ptr_, other.ptr_); }
+  RT_DIAG_POP
 
   // Get the stored pointer.
   RT_API_ATTRS pointer_type get() const { return ptr_; }
@@ -128,9 +131,12 @@ inline RT_API_ATTRS bool operator!=(std::nullptr_t, const OwningPtr<X> &x) {
 
 template <typename A> class SizedNew {
 public:
-  explicit SizedNew(const Terminator &terminator) : terminator_{terminator} {}
+  explicit RT_API_ATTRS SizedNew(const Terminator &terminator)
+      : terminator_{terminator} {}
+
   template <typename... X>
-  [[nodiscard]] OwningPtr<A> operator()(std::size_t bytes, X &&...x) {
+  [[nodiscard]] RT_API_ATTRS OwningPtr<A> operator()(
+      std::size_t bytes, X &&...x) {
     return OwningPtr<A>{new (AllocateMemoryOrCrash(terminator_, bytes))
             A{std::forward<X>(x)...}};
   }
@@ -141,7 +147,8 @@ template <typename A> class SizedNew {
 
 template <typename A> struct New : public SizedNew<A> {
   using SizedNew<A>::SizedNew;
-  template <typename... X> [[nodiscard]] OwningPtr<A> operator()(X &&...x) {
+  template <typename... X>
+  [[nodiscard]] RT_API_ATTRS OwningPtr<A> operator()(X &&...x) {
     return SizedNew<A>::operator()(sizeof(A), std::forward<X>(x)...);
   }
 };
diff --git a/flang/include/flang/Runtime/type-code.h b/flang/include/flang/Runtime/type-code.h
index f7419249c2ba9c..8e7314e0af1efc 100644
--- a/flang/include/flang/Runtime/type-code.h
+++ b/flang/include/flang/Runtime/type-code.h
@@ -12,7 +12,6 @@
 #include "flang/Common/Fortran.h"
 #include "flang/Common/optional.h"
 #include "flang/ISO_Fortran_binding_wrapper.h"
-#include <optional>
 #include <utility>
 
 namespace Fortran::runtime {
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index 02147487115497..d002f68f0ab8e6 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -180,22 +180,38 @@ set(supported_files
   allocatable.cpp
   array-constructor.cpp
   assign.cpp
+  buffer.cpp
   character.cpp
+  connection.cpp
   copy.cpp
   derived-api.cpp
   derived.cpp
   descriptor.cpp
+  descriptor-io.cpp
   dot-product.cpp
+  edit-input.cpp
+  edit-output.cpp
+  environment.cpp
   extrema.cpp
+  external-unit.cpp
   findloc.cpp
+  format.cpp
   inquiry.cpp
+  internal-unit.cpp
+  io-api.cpp
+  io-error.cpp
+  io-stmt.cpp
+  iostat.cpp
   matmul-transpose.cpp
   matmul.cpp
   memory.cpp
   misc-intrinsic.cpp
+  namelist.cpp
+  non-tbp-dio.cpp
   numeric.cpp
   pointer.cpp
   product.cpp
+  pseudo-unit.cpp
   ragged.cpp
   stat.cpp
   sum.cpp
@@ -205,6 +221,8 @@ set(supported_files
   transformational.cpp
   type-code.cpp
   type-info.cpp
+  unit.cpp
+  utf.cpp
   )
 
 if (FLANG_EXPERIMENTAL_CUDA_RUNTIME)
@@ -236,6 +254,7 @@ if (FLANG_EXPERIMENTAL_CUDA_RUNTIME)
       #   'long double' is treated as 'double' in device code
       -Xcudafe --diag_suppress=20208
       -Xcudafe --display_error_number
+      -G -g
       )
   endif()
   set_source_files_properties(${supported_files} PROPERTIES COMPILE_OPTIONS
diff --git a/flang/runtime/buffer.cpp b/flang/runtime/buffer.cpp
index 15c83bfd249232..7b4869d69c2e51 100644
--- a/flang/runtime/buffer.cpp
+++ b/flang/runtime/buffer.cpp
@@ -10,14 +10,20 @@
 #include <algorithm>
 
 namespace Fortran::runtime::io {
+RT_OFFLOAD_API_GROUP_BEGIN
 
 // Here's a very old trick for shifting circular buffer data cheaply
 // without a need for a temporary array.
 void LeftShiftBufferCircularly(
     char *buffer, std::size_t bytes, std::size_t shift) {
   // Assume that we start with "efgabcd" and the left shift is 3.
+  RT_DIAG_PUSH
+  RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
   std::reverse(buffer, buffer + shift); // "gfeabcd"
   std::reverse(buffer, buffer + bytes); // "dcbaefg"
   std::reverse(buffer, buffer + bytes - shift); // "abcdefg"
+  RT_DIAG_POP
 }
+
+RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime::io
diff --git a/flang/runtime/buffer.h b/flang/runtime/buffer.h
index 93fda36f500d31..ca1baea12efafa 100644
--- a/flang/runtime/buffer.h
+++ b/flang/runtime/buffer.h
@@ -11,6 +11,7 @@
 #ifndef FORTRAN_RUNTIME_BUFFER_H_
 #define FORTRAN_RUNTIME_BUFFER_H_
 
+#include "freestanding-tools.h"
 #include "io-error.h"
 #include "flang/Runtime/memory.h"
 #include <algorithm>
@@ -19,7 +20,8 @@
 
 namespace Fortran::runtime::io {
 
-void LeftShiftBufferCircularly(char *, std::size_t bytes, std::size_t shift);
+RT_API_ATTRS void LeftShiftBufferCircularly(
+    char *, std::size_t bytes, std::size_t shift);
 
 // Maintains a view of a contiguous region of a file in a memory buffer.
 // The valid data in the buffer may be circular, but any active frame
@@ -48,22 +50,24 @@ template <typename STORE, std::size_t minBuffer = 65536> class FileFrame {
 public:
   using FileOffset = std::int64_t;
 
-  ~FileFrame() { FreeMemoryAndNullify(buffer_); }
+  RT_API_ATTRS ~FileFrame() { FreeMemoryAndNullify(buffer_); }
 
   // The valid data in the buffer begins at buffer_[start_] and proceeds
   // with possible wrap-around for length_ bytes.  The current frame
   // is offset by frame_ bytes into that region and is guaranteed to
   // be contiguous for at least as many bytes as were requested.
 
-  FileOffset FrameAt() const { return fileOffset_ + frame_; }
-  char *Frame() const { return buffer_ + start_ + frame_; }
-  std::size_t FrameLength() const {
+  RT_API_ATTRS FileOffset FrameAt() const { return fileOffset_ + frame_; }
+  RT_API_ATTRS char *Frame() const { return buffer_ + start_ + frame_; }
+  RT_API_ATTRS std::size_t FrameLength() const {
     return std::min<std::size_t>(length_ - frame_, size_ - (start_ + frame_));
   }
-  std::size_t BytesBufferedBeforeFrame() const { return frame_ - start_; }
+  RT_API_ATTRS std::size_t BytesBufferedBeforeFrame() const {
+    return frame_ - start_;
+  }
 
   // Returns a short frame at a non-fatal EOF.  Can return a long frame as well.
-  std::size_t ReadFrame(
+  RT_API_ATTRS std::size_t ReadFrame(
       FileOffset at, std::size_t bytes, IoErrorHandler &handler) {
     Flush(handler);
     Reallocate(bytes, handler);
@@ -92,7 +96,8 @@ template <typename STORE, std::size_t minBuffer = 65536> class FileFrame {
     return FrameLength();
   }
 
-  void WriteFrame(FileOffset at, std::size_t bytes, IoErrorHandler &handler) {
+  RT_API_ATTRS void WriteFrame(
+      FileOffset at, std::size_t bytes, IoErrorHandler &handler) {
     Reallocate(bytes, handler);
     std::int64_t newFrame{at - fileOffset_};
     if (!dirty_ || newFrame < 0 || newFrame > length_) {
@@ -110,7 +115,7 @@ template <typename STORE, std::size_t minBuffer = 65536> class FileFrame {
     length_ = std::max<std::int64_t>(length_, frame_ + bytes);
   }
 
-  void Flush(IoErrorHandler &handler, std::int64_t keep = 0) {
+  RT_API_ATTRS void Flush(IoErrorHandler &handler, std::int64_t keep = 0) {
     if (dirty_) {
       while (length_ > keep) {
         std::size_t chunk{
@@ -128,7 +133,7 @@ template <typename STORE, std::size_t minBuffer = 65536> class FileFrame {
     }
   }
 
-  void TruncateFrame(std::int64_t at, IoErrorHandler &handler) {
+  RT_API_ATTRS void TruncateFrame(std::int64_t at, IoErrorHandler &handler) {
     RUNTIME_CHECK(handler, !dirty_);
     if (at <= fileOffset_) {
       Reset(at);
@@ -138,9 +143,10 @@ template <typename STORE, std::size_t minBuffer = 65536> class FileFrame {
   }
 
 private:
-  STORE &Store() { return static_cast<STORE &>(*this); }
+  RT_API_ATTRS STORE &Store() { return static_cast<STORE &>(*this); }
 
-  void Reallocate(std::int64_t bytes, const Terminator &terminator) {
+  RT_API_ATTRS void Reallocate(
+      std::int64_t bytes, const Terminator &terminator) {
     if (bytes > size_) {
       char *old{buffer_};
       auto oldSize{size_};
@@ -160,13 +166,14 @@ template <typename STORE, std::size_t minBuffer = 65536> class FileFrame {
     }
   }
 
-  void Reset(FileOffset at) {
+  RT_API_ATTRS void Reset(FileOffset at) {
     start_ = length_ = frame_ = 0;
     fileOffset_ = at;
     dirty_ = false;
   }
 
-  void DiscardLeadingBytes(std::int64_t n, const Terminator &terminator) {
+  RT_API_ATTRS void DiscardLeadingBytes(
+      std::int64_t n, const Terminator &terminator) {
     RUNTIME_CHECK(terminator, length_ >= n);
     length_ -= n;
     if (length_ == 0) {
@@ -185,19 +192,20 @@ template <typename STORE, std::size_t minBuffer = 65536> class FileFrame {
     fileOffset_ += n;
   }
 
-  void MakeDataContiguous(IoErrorHandler &handler, std::size_t bytes) {
+  RT_API_ATTRS void MakeDataContiguous(
+      IoErrorHandler &handler, std::size_t bytes) {
     if (static_cast<std::int64_t>(start_ + bytes) > size_) {
       // Frame would wrap around; shift current data (if any) to force
       // contiguity.
       RUNTIME_CHECK(handler, length_ < size_);
       if (start_ + length_ <= size_) {
         // [......abcde..] -> [abcde........]
-        std::memmove(buffer_, buffer_ + start_, length_);
+        runtime::memmove(buffer_, buffer_ + start_, length_);
       } else {
         // [cde........ab] -> [abcde........]
         auto n{start_ + length_ - size_}; // 3 for cde
         RUNTIME_CHECK(handler, length_ >= n);
-        std::memmove(buffer_ + n, buffer_ + start_, length_ - n); // cdeab
+        runtime::memmove(buffer_ + n, buffer_ + start_, length_ - n); // cdeab
         LeftShiftBufferCircularly(buffer_, length_, n); // abcde
       }
       start_ = 0;
diff --git a/flang/runtime/connection.cpp b/flang/runtime/connection.cpp
index 91ac9a0e14e47b..f24f0e832eb484 100644
--- a/flang/runtime/connection.cpp
+++ b/flang/runtime/connection.cpp
@@ -12,30 +12,31 @@
 #include <algorithm>
 
 namespace Fortran::runtime::io {
+RT_OFFLOAD_API_GROUP_BEGIN
 
-std::size_t ConnectionState::RemainingSpaceInRecord() const {
+RT_API_ATTRS std::size_t ConnectionState::RemainingSpaceInRecord() const {
   auto recl{recordLength.value_or(openRecl.value_or(
       executionEnvironment.listDirectedOutputLineLengthLimit))};
   return positionInRecord >= recl ? 0 : recl - positionInRecord;
 }
 
-bool ConnectionState::NeedAdvance(std::size_t width) const {
+RT_API_ATTRS bool ConnectionState::NeedAdvance(std::size_t width) const {
   return positionInRecord > 0 && width > RemainingSpaceInRecord();
 }
 
-bool ConnectionState::IsAtEOF() const {
+RT_API_ATTRS bool ConnectionState::IsAtEOF() const {
   return endfileRecordNumber && currentRecordNumber >= *endfileRecordNumber;
 }
 
-bool ConnectionState::IsAfterEndfile() const {
+RT_API_ATTRS bool ConnectionState::IsAfterEndfile() const {
   return endfileRecordNumber && currentRecordNumber > *endfileRecordNumber;
 }
 
-void ConnectionState::HandleAbsolutePosition(std::int64_t n) {
+RT_API_ATTRS void ConnectionState::HandleAbsolutePosition(std::int64_t n) {
   positionInRecord = std::max(n, std::int64_t{0}) + leftTabLimit.value_or(0);
 }
 
-void ConnectionState::HandleRelativePosition(std::int64_t n) {
+RT_API_ATTRS void ConnectionState::HandleRelativePosition(std::int64_t n) {
   positionInRecord = std::max(leftTabLimit.value_or(0), positionInRecord + n);
 }
 
@@ -57,4 +58,6 @@ SavedPosition::~SavedPosition() {
     conn.pinnedFrame = saved_.pinnedFrame;
   }
 }
+
+RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime::io
diff --git a/flang/runtime/connection.h b/flang/runtime/connection.h
index c41970d47e7b09..6f1ea90a160e5e 100644
--- a/flang/runtime/connection.h
+++ b/flang/runtime/connection.h
@@ -31,12 +31,12 @@ struct ConnectionAttributes {
   unsigned char internalIoCharKind{0}; // 0->external, 1/2/4->internal
   Fortran::common::optional<std::int64_t> openRecl; // RECL= on OPEN
 
-  bool IsRecordFile() const {
+  RT_API_ATTRS bool IsRecordFile() const {
     // Formatted stream files are viewed as having records, at least on input
     return access != Access::Stream || !isUnformatted.value_or(true);
   }
 
-  template <typename CHAR = char> constexpr bool useUTF8() const {
+  template <typename CHAR = char> constexpr RT_API_ATTRS bool useUTF8() const {
     // For wide CHARACTER kinds, always use UTF-8 for formatted I/O.
     // For single-byte CHARACTER, encode characters >= 0x80 with
     // UTF-8 iff the mode is set.
@@ -45,25 +45,28 @@ struct ConnectionAttributes {
 };
 
 struct ConnectionState : public ConnectionAttributes {
-  bool IsAtEOF() const; // true when read has hit EOF or endfile record
-  bool IsAfterEndfile() const; // true after ENDFILE until repositioned
+  RT_API_ATTRS bool
+  IsAtEOF() const; // true when read has hit EOF or endfile record
+  RT_API_ATTRS bool
+  IsAfterEndfile() const; // true after ENDFILE until repositioned
 
   // All positions and measurements are always in units of bytes,
   // not characters.  Multi-byte character encodings are possible in
   // both internal I/O (when the character kind of the variable is 2 or 4)
   // and external formatted I/O (when the encoding is UTF-8).
-  std::size_t RemainingSpaceInRecord() const;
-  bool NeedAdvance(std::size_t) const;
-  void HandleAbsolutePosition(std::int64_t);
-  void HandleRelativePosition(std::int64_t);
+  RT_API_ATTRS std::size_t RemainingSpaceInRecord() const;
+  RT_API_ATTRS bool NeedAdvance(std::size_t) const;
+  RT_API_ATTRS void HandleAbsolutePosition(std::int64_t);
+  RT_API_ATTRS void HandleRelativePosition(std::int64_t);
 
-  void BeginRecord() {
+  RT_API_ATTRS void BeginRecord() {
     positionInRecord = 0;
     furthestPositionInRecord = 0;
     unterminatedRecord = false;
   }
 
-  Fortran::common::optional<std::int64_t> EffectiveRecordLength() const {
+  RT_API_ATTRS Fortran::common::optional<std::int64_t>
+  EffectiveRecordLength() const {
     // When an input record is longer than an explicit RECL= from OPEN
     // it is effectively truncated on input.
     return openRecl && recordLength && *openRecl < *recordLength ? openRecl
@@ -110,9 +113,9 @@ struct ConnectionState : public ConnectionAttributes {
 // Utility class for capturing and restoring a position in an input stream.
 class SavedPosition {
 public:
-  explicit SavedPosition(IoStatementState &);
-  ~SavedPosition();
-  void Cancel() { cancelled_ = true; }
+  explicit RT_API_ATTRS SavedPosition(IoStatementState &);
+  RT_API_ATTRS ~SavedPosition();
+  RT_API_ATTRS void Cancel() { cancelled_ = true; }
 
 private:
   IoStatementState &io_;
diff --git a/flang/runtime/descriptor-io.cpp b/flang/runtime/descriptor-io.cpp
index 7c7323b719adf8..93df51cf22d3f4 100644
--- a/flang/runtime/descriptor-io.cpp
+++ b/flang/runtime/descriptor-io.cpp
@@ -7,9 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "descriptor-io.h"
+#include "freestanding-tools.h"
 #include "flang/Common/restorer.h"
 
 namespace Fortran::runtime::io::descr {
+RT_OFFLOAD_API_GROUP_BEGIN
 
 // Defined formatted I/O (maybe)
 Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
@@ -32,9 +34,9 @@ Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
       ioType[1] = 'T';
       std::memcpy(ioType + 2, edit.ioType, edit.ioTypeChars);
     } else {
-      std::strcpy(
+      runtime::strcpy(
           ioType, io.mutableModes().inNamelist ? "NAMELIST" : "LISTDIRECTED");
-      ioTypeLen = std::strlen(ioType);
+      ioTypeLen = runtime::strlen(ioType);
     }
     StaticDescriptor<1, true> vListStatDesc;
     Descriptor &vListDesc{vListStatDesc.descriptor()};
@@ -150,4 +152,5 @@ bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor,
   return handler.GetIoStat() == IostatOk;
 }
 
+RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime::io::descr
diff --git a/flang/runtime/descriptor-io.h b/flang/runtime/descriptor-io.h
index b6b0fefcff870b..7063858d619196 100644
--- a/flang/runtime/descriptor-io.h
+++ b/flang/runtime/descriptor-io.h
@@ -28,8 +28,8 @@
 
 namespace Fortran::runtime::io::descr {
 template <typename A>
-inline A &ExtractElement(IoStatementState &io, const Descriptor &descriptor,
-    const SubscriptValue subscripts[]) {
+inline RT_API_ATTRS A &ExtractElement(IoStatementState &io,
+    const Descriptor &descriptor, const SubscriptValue subscripts[]) {
   A *p{descriptor.Element<A>(subscripts)};
   if (!p) {
     io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base "
@@ -45,7 +45,7 @@ inline A &ExtractElement(IoStatementState &io, const Descriptor &descriptor,
 // NAMELIST array output.
 
 template <int KIND, Direction DIR>
-inline bool FormattedIntegerIO(
+inline RT_API_ATTRS bool FormattedIntegerIO(
     IoStatementState &io, const Descriptor &descriptor) {
   std::size_t numElements{descriptor.Elements()};
   SubscriptValue subscripts[maxRank];
@@ -78,7 +78,7 @@ inline bool FormattedIntegerIO(
 }
 
 template <int KIND, Direction DIR>
-inline bool FormattedRealIO(
+inline RT_API_ATTRS bool FormattedRealIO(
     IoStatementState &io, const Descriptor &descriptor) {
   std::size_t numElements{descriptor.Elements()};
   SubscriptValue subscripts[maxRank];
@@ -111,7 +111,7 @@ inline bool FormattedRealIO(
 }
 
 template <int KIND, Direction DIR>
-inline bool FormattedComplexIO(
+inline RT_API_ATTRS bool FormattedComplexIO(
     IoStatementState &io, const Descriptor &descriptor) {
   std::size_t numElements{descriptor.Elements()};
   SubscriptValue subscripts[maxRank];
@@ -159,7 +159,7 @@ inline bool FormattedComplexIO(
 }
 
 template <typename A, Direction DIR>
-inline bool FormattedCharacterIO(
+inline RT_API_ATTRS bool FormattedCharacterIO(
     IoStatementState &io, const Descriptor &descriptor) {
   std::size_t numElements{descriptor.Elements()};
   SubscriptValue subscripts[maxRank];
@@ -199,7 +199,7 @@ inline bool FormattedCharacterIO(
 }
 
 template <int KIND, Direction DIR>
-inline bool FormattedLogicalIO(
+inline RT_API_ATTRS bool FormattedLogicalIO(
     IoStatementState &io, const Descriptor &descriptor) {
   std::size_t numElements{descriptor.Elements()};
   SubscriptValue subscripts[maxRank];
@@ -241,12 +241,12 @@ inline bool FormattedLogicalIO(
 }
 
 template <Direction DIR>
-static bool DescriptorIO(IoStatementState &, const Descriptor &,
+static RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &,
     const NonTbpDefinedIoTable * = nullptr);
 
 // For intrinsic (not defined) derived type I/O, formatted & unformatted
 template <Direction DIR>
-static bool DefaultComponentIO(IoStatementState &io,
+static RT_API_ATTRS bool DefaultComponentIO(IoStatementState &io,
     const typeInfo::Component &component, const Descriptor &origDescriptor,
     const SubscriptValue origSubscripts[], Terminator &terminator,
     const NonTbpDefinedIoTable *table) {
@@ -269,7 +269,7 @@ static bool DefaultComponentIO(IoStatementState &io,
 }
 
 template <Direction DIR>
-static bool DefaultComponentwiseFormattedIO(IoStatementState &io,
+static RT_API_ATTRS bool DefaultComponentwiseFormattedIO(IoStatementState &io,
     const Descriptor &descriptor, const typeInfo::DerivedType &type,
     const NonTbpDefinedIoTable *table, const SubscriptValue subscripts[]) {
   IoErrorHandler &handler{io.GetIoErrorHandler()};
@@ -295,7 +295,7 @@ static bool DefaultComponentwiseFormattedIO(IoStatementState &io,
 }
 
 template <Direction DIR>
-static bool DefaultComponentwiseUnformattedIO(IoStatementState &io,
+static RT_API_ATTRS bool DefaultComponentwiseUnformattedIO(IoStatementState &io,
     const Descriptor &descriptor, const typeInfo::DerivedType &type,
     const NonTbpDefinedIoTable *table) {
   IoErrorHandler &handler{io.GetIoErrorHandler()};
@@ -322,12 +322,12 @@ static bool DefaultComponentwiseUnformattedIO(IoStatementState &io,
   return true;
 }
 
-Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &,
-    const Descriptor &, const typeInfo::DerivedType &,
+RT_API_ATTRS Fortran::common::optional<bool> DefinedFormattedIo(
+    IoStatementState &, const Descriptor &, const typeInfo::DerivedType &,
     const typeInfo::SpecialBinding &, const SubscriptValue[]);
 
 template <Direction DIR>
-static bool FormattedDerivedTypeIO(IoStatementState &io,
+static RT_API_ATTRS bool FormattedDerivedTypeIO(IoStatementState &io,
     const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
   IoErrorHandler &handler{io.GetIoErrorHandler()};
   // Derived type information must be present for formatted I/O.
@@ -385,12 +385,12 @@ static bool FormattedDerivedTypeIO(IoStatementState &io,
   return true;
 }
 
-bool DefinedUnformattedIo(IoStatementState &, const Descriptor &,
+RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &, const Descriptor &,
     const typeInfo::DerivedType &, const typeInfo::SpecialBinding &);
 
 // Unformatted I/O
 template <Direction DIR>
-static bool UnformattedDescriptorIO(IoStatementState &io,
+static RT_API_ATTRS bool UnformattedDescriptorIO(IoStatementState &io,
     const Descriptor &descriptor, const NonTbpDefinedIoTable *table = nullptr) {
   IoErrorHandler &handler{io.GetIoErrorHandler()};
   const DescriptorAddendum *addendum{descriptor.Addendum()};
@@ -488,8 +488,8 @@ static bool UnformattedDescriptorIO(IoStatementState &io,
 }
 
 template <Direction DIR>
-static bool DescriptorIO(IoStatementState &io, const Descriptor &descriptor,
-    const NonTbpDefinedIoTable *table) {
+static RT_API_ATTRS bool DescriptorIO(IoStatementState &io,
+    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
   IoErrorHandler &handler{io.GetIoErrorHandler()};
   if (handler.InError()) {
     return false;
diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp
index fbeb1a595b327e..935b7c299b2564 100644
--- a/flang/runtime/edit-input.cpp
+++ b/flang/runtime/edit-input.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "edit-input.h"
+#include "freestanding-tools.h"
 #include "namelist.h"
 #include "utf.h"
 #include "flang/Common/optional.h"
@@ -16,17 +17,19 @@
 #include <cfenv>
 
 namespace Fortran::runtime::io {
+RT_OFFLOAD_API_GROUP_BEGIN
 
 // Checks that a list-directed input value has been entirely consumed and
 // doesn't contain unparsed characters before the next value separator.
-static inline bool IsCharValueSeparator(const DataEdit &edit, char32_t ch) {
+static inline RT_API_ATTRS bool IsCharValueSeparator(
+    const DataEdit &edit, char32_t ch) {
   char32_t comma{
       edit.modes.editingFlags & decimalComma ? char32_t{';'} : char32_t{','}};
   return ch == ' ' || ch == '\t' || ch == comma || ch == '/' ||
       (edit.IsNamelist() && (ch == '&' || ch == '$'));
 }
 
-static bool CheckCompleteListDirectedField(
+static RT_API_ATTRS bool CheckCompleteListDirectedField(
     IoStatementState &io, const DataEdit &edit) {
   if (edit.IsListDirected()) {
     std::size_t byteCount;
@@ -52,7 +55,7 @@ static bool CheckCompleteListDirectedField(
 }
 
 template <int LOG2_BASE>
-static bool EditBOZInput(
+static RT_API_ATTRS bool EditBOZInput(
     IoStatementState &io, const DataEdit &edit, void *n, std::size_t bytes) {
   // Skip leading white space & zeroes
   Fortran::common::optional<int> remaining{io.CueUpInput(edit)};
@@ -151,13 +154,13 @@ static bool EditBOZInput(
   return CheckCompleteListDirectedField(io, edit);
 }
 
-static inline char32_t GetRadixPointChar(const DataEdit &edit) {
+static inline RT_API_ATTRS char32_t GetRadixPointChar(const DataEdit &edit) {
   return edit.modes.editingFlags & decimalComma ? char32_t{','} : char32_t{'.'};
 }
 
 // Prepares input from a field, and returns the sign, if any, else '\0'.
-static char ScanNumericPrefix(IoStatementState &io, const DataEdit &edit,
-    Fortran::common::optional<char32_t> &next,
+static RT_API_ATTRS char ScanNumericPrefix(IoStatementState &io,
+    const DataEdit &edit, Fortran::common::optional<char32_t> &next,
     Fortran::common::optional<int> &remaining) {
   remaining = io.CueUpInput(edit);
   next = io.NextInField(remaining, edit);
@@ -174,7 +177,7 @@ static char ScanNumericPrefix(IoStatementState &io, const DataEdit &edit,
   return sign;
 }
 
-bool EditIntegerInput(
+RT_API_ATTRS bool EditIntegerInput(
     IoStatementState &io, const DataEdit &edit, void *n, int kind) {
   RUNTIME_CHECK(io.GetIoErrorHandler(), kind >= 1 && !(kind & (kind - 1)));
   switch (edit.descriptor) {
@@ -279,18 +282,20 @@ struct ScannedRealInput {
   int exponent{0}; // adjusted as necessary; binary if isHexadecimal
   bool isHexadecimal{false}; // 0X...
 };
-static ScannedRealInput ScanRealInput(
+static RT_API_ATTRS ScannedRealInput ScanRealInput(
     char *buffer, int bufferSize, IoStatementState &io, const DataEdit &edit) {
   Fortran::common::optional<int> remaining;
   Fortran::common::optional<char32_t> next;
   int got{0};
   Fortran::common::optional<int> radixPointOffset;
-  auto Put{[&](char ch) -> void {
+  // The following lambda definition violates the conding style,
+  // but cuda-11.8 nvcc hits an internal error with the brace initialization.
+  auto Put = [&](char ch) -> void {
     if (got < bufferSize) {
       buffer[got] = ch;
     }
     ++got;
-  }};
+  };
   char sign{ScanNumericPrefix(io, edit, next, remaining)};
   if (sign == '-') {
     Put('-');
@@ -487,13 +492,21 @@ static ScannedRealInput ScanRealInput(
   return {got, exponent, isHexadecimal};
 }
 
-static void RaiseFPExceptions(decimal::ConversionResultFlags flags) {
+static RT_API_ATTRS void RaiseFPExceptions(
+    decimal::ConversionResultFlags flags) {
 #undef RAISE
+#if defined(RT_DEVICE_COMPILATION)
+  Terminator terminator(__FILE__, __LINE__);
+#define RAISE(e) \
+  terminator.Crash( \
+      "not implemented yet: raising FP exception in device code: %s", #e);
+#else // !defined(RT_DEVICE_COMPILATION)
 #ifdef feraisexcept // a macro in some environments; omit std::
 #define RAISE feraiseexcept
 #else
 #define RAISE std::feraiseexcept
 #endif
+#endif // !defined(RT_DEVICE_COMPILATION)
   if (flags & decimal::ConversionResultFlags::Overflow) {
     RAISE(FE_OVERFLOW);
   }
@@ -514,7 +527,7 @@ static void RaiseFPExceptions(decimal::ConversionResultFlags flags) {
 // converter without modification, this fast path for real input
 // saves time by avoiding memory copies and reformatting of the exponent.
 template <int PRECISION>
-static bool TryFastPathRealDecimalInput(
+static RT_API_ATTRS bool TryFastPathRealDecimalInput(
     IoStatementState &io, const DataEdit &edit, void *n) {
   if (edit.modes.editingFlags & (blankZero | decimalComma)) {
     return false;
@@ -586,7 +599,8 @@ static bool TryFastPathRealDecimalInput(
 }
 
 template <int binaryPrecision>
-decimal::ConversionToBinaryResult<binaryPrecision> ConvertHexadecimal(
+RT_API_ATTRS decimal::ConversionToBinaryResult<binaryPrecision>
+ConvertHexadecimal(
     const char *&p, enum decimal::FortranRounding rounding, int expo) {
   using RealType = decimal::BinaryFloatingPointNumber<binaryPrecision>;
   using RawType = typename RealType::RawType;
@@ -702,7 +716,8 @@ decimal::ConversionToBinaryResult<binaryPrecision> ConvertHexadecimal(
 }
 
 template <int KIND>
-bool EditCommonRealInput(IoStatementState &io, const DataEdit &edit, void *n) {
+RT_API_ATTRS bool EditCommonRealInput(
+    IoStatementState &io, const DataEdit &edit, void *n) {
   constexpr int binaryPrecision{common::PrecisionOfRealKind(KIND)};
   if (TryFastPathRealDecimalInput<binaryPrecision>(io, edit, n)) {
     return CheckCompleteListDirectedField(io, edit);
@@ -798,7 +813,8 @@ bool EditCommonRealInput(IoStatementState &io, const DataEdit &edit, void *n) {
 }
 
 template <int KIND>
-bool EditRealInput(IoStatementState &io, const DataEdit &edit, void *n) {
+RT_API_ATTRS bool EditRealInput(
+    IoStatementState &io, const DataEdit &edit, void *n) {
   switch (edit.descriptor) {
   case DataEdit::ListDirected:
     if (IsNamelistNameOrSlash(io)) {
@@ -832,7 +848,8 @@ bool EditRealInput(IoStatementState &io, const DataEdit &edit, void *n) {
 }
 
 // 13.7.3 in Fortran 2018
-bool EditLogicalInput(IoStatementState &io, const DataEdit &edit, bool &x) {
+RT_API_ATTRS bool EditLogicalInput(
+    IoStatementState &io, const DataEdit &edit, bool &x) {
   switch (edit.descriptor) {
   case DataEdit::ListDirected:
     if (IsNamelistNameOrSlash(io)) {
@@ -882,7 +899,7 @@ bool EditLogicalInput(IoStatementState &io, const DataEdit &edit, bool &x) {
 
 // See 13.10.3.1 paragraphs 7-9 in Fortran 2018
 template <typename CHAR>
-static bool EditDelimitedCharacterInput(
+static RT_API_ATTRS bool EditDelimitedCharacterInput(
     IoStatementState &io, CHAR *x, std::size_t length, char32_t delimiter) {
   bool result{true};
   while (true) {
@@ -911,12 +928,12 @@ static bool EditDelimitedCharacterInput(
       --length;
     }
   }
-  std::fill_n(x, length, ' ');
+  Fortran::runtime::fill_n(x, length, ' ');
   return result;
 }
 
 template <typename CHAR>
-static bool EditListDirectedCharacterInput(
+static RT_API_ATTRS bool EditListDirectedCharacterInput(
     IoStatementState &io, CHAR *x, std::size_t length, const DataEdit &edit) {
   std::size_t byteCount{0};
   auto ch{io.GetCurrentChar(byteCount)};
@@ -961,13 +978,13 @@ static bool EditListDirectedCharacterInput(
       remaining = --length > 0 ? maxUTF8Bytes : 0;
     }
   }
-  std::fill_n(x, length, ' ');
+  Fortran::runtime::fill_n(x, length, ' ');
   return true;
 }
 
 template <typename CHAR>
-bool EditCharacterInput(IoStatementState &io, const DataEdit &edit, CHAR *x,
-    std::size_t lengthChars) {
+RT_API_ATTRS bool EditCharacterInput(IoStatementState &io, const DataEdit &edit,
+    CHAR *x, std::size_t lengthChars) {
   switch (edit.descriptor) {
   case DataEdit::ListDirected:
     return EditListDirectedCharacterInput(io, x, lengthChars, edit);
@@ -1011,7 +1028,7 @@ bool EditCharacterInput(IoStatementState &io, const DataEdit &edit, CHAR *x,
         if (io.CheckForEndOfRecord(readyBytes)) {
           if (readyBytes == 0) {
             // PAD='YES' and no more data
-            std::fill_n(x, lengthChars, ' ');
+            Fortran::runtime::fill_n(x, lengthChars, ' ');
             return !io.GetIoErrorHandler().InError();
           } else {
             // Do partial read(s) then pad on last iteration
@@ -1088,23 +1105,30 @@ bool EditCharacterInput(IoStatementState &io, const DataEdit &edit, CHAR *x,
     readyBytes -= chunkBytes;
   }
   // Pad the remainder of the input variable, if any.
-  std::fill_n(x, lengthChars, ' ');
+  Fortran::runtime::fill_n(x, lengthChars, ' ');
   return CheckCompleteListDirectedField(io, edit);
 }
 
-template bool EditRealInput<2>(IoStatementState &, const DataEdit &, void *);
-template bool EditRealInput<3>(IoStatementState &, const DataEdit &, void *);
-template bool EditRealInput<4>(IoStatementState &, const DataEdit &, void *);
-template bool EditRealInput<8>(IoStatementState &, const DataEdit &, void *);
-template bool EditRealInput<10>(IoStatementState &, const DataEdit &, void *);
+template RT_API_ATTRS bool EditRealInput<2>(
+    IoStatementState &, const DataEdit &, void *);
+template RT_API_ATTRS bool EditRealInput<3>(
+    IoStatementState &, const DataEdit &, void *);
+template RT_API_ATTRS bool EditRealInput<4>(
+    IoStatementState &, const DataEdit &, void *);
+template RT_API_ATTRS bool EditRealInput<8>(
+    IoStatementState &, const DataEdit &, void *);
+template RT_API_ATTRS bool EditRealInput<10>(
+    IoStatementState &, const DataEdit &, void *);
 // TODO: double/double
-template bool EditRealInput<16>(IoStatementState &, const DataEdit &, void *);
+template RT_API_ATTRS bool EditRealInput<16>(
+    IoStatementState &, const DataEdit &, void *);
 
-template bool EditCharacterInput(
+template RT_API_ATTRS bool EditCharacterInput(
     IoStatementState &, const DataEdit &, char *, std::size_t);
-template bool EditCharacterInput(
+template RT_API_ATTRS bool EditCharacterInput(
     IoStatementState &, const DataEdit &, char16_t *, std::size_t);
-template bool EditCharacterInput(
+template RT_API_ATTRS bool EditCharacterInput(
     IoStatementState &, const DataEdit &, char32_t *, std::size_t);
 
+RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime::io
diff --git a/flang/runtime/edit-input.h b/flang/runtime/edit-input.h
index 61844a1199a748..a90180b8ee2ebd 100644
--- a/flang/runtime/edit-input.h
+++ b/flang/runtime/edit-input.h
@@ -15,36 +15,38 @@
 
 namespace Fortran::runtime::io {
 
-bool EditIntegerInput(IoStatementState &, const DataEdit &, void *, int kind);
+RT_API_ATTRS bool EditIntegerInput(
+    IoStatementState &, const DataEdit &, void *, int kind);
 
 template <int KIND>
-bool EditRealInput(IoStatementState &, const DataEdit &, void *);
+RT_API_ATTRS bool EditRealInput(IoStatementState &, const DataEdit &, void *);
 
-bool EditLogicalInput(IoStatementState &, const DataEdit &, bool &);
+RT_API_ATTRS bool EditLogicalInput(
+    IoStatementState &, const DataEdit &, bool &);
 
 template <typename CHAR>
-bool EditCharacterInput(
+RT_API_ATTRS bool EditCharacterInput(
     IoStatementState &, const DataEdit &, CHAR *, std::size_t);
 
-extern template bool EditRealInput<2>(
+extern template RT_API_ATTRS bool EditRealInput<2>(
     IoStatementState &, const DataEdit &, void *);
-extern template bool EditRealInput<3>(
+extern template RT_API_ATTRS bool EditRealInput<3>(
     IoStatementState &, const DataEdit &, void *);
-extern template bool EditRealInput<4>(
+extern template RT_API_ATTRS bool EditRealInput<4>(
     IoStatementState &, const DataEdit &, void *);
-extern template bool EditRealInput<8>(
+extern template RT_API_ATTRS bool EditRealInput<8>(
     IoStatementState &, const DataEdit &, void *);
-extern template bool EditRealInput<10>(
+extern template RT_API_ATTRS bool EditRealInput<10>(
     IoStatementState &, const DataEdit &, void *);
 // TODO: double/double
-extern template bool EditRealInput<16>(
+extern template RT_API_ATTRS bool EditRealInput<16>(
     IoStatementState &, const DataEdit &, void *);
 
-extern template bool EditCharacterInput(
+extern template RT_API_ATTRS bool EditCharacterInput(
     IoStatementState &, const DataEdit &, char *, std::size_t);
-extern template bool EditCharacterInput(
+extern template RT_API_ATTRS bool EditCharacterInput(
     IoStatementState &, const DataEdit &, char16_t *, std::size_t);
-extern template bool EditCharacterInput(
+extern template RT_API_ATTRS bool EditCharacterInput(
     IoStatementState &, const DataEdit &, char32_t *, std::size_t);
 
 } // namespace Fortran::runtime::io
diff --git a/flang/runtime/edit-output.cpp b/flang/runtime/edit-output.cpp
index 7267540370fc07..f3cd94bfe32279 100644
--- a/flang/runtime/edit-output.cpp
+++ b/flang/runtime/edit-output.cpp
@@ -14,9 +14,10 @@
 #include <algorithm>
 
 namespace Fortran::runtime::io {
+RT_OFFLOAD_API_GROUP_BEGIN
 
 // In output statement, add a space between numbers and characters.
-static void addSpaceBeforeCharacter(IoStatementState &io) {
+static RT_API_ATTRS void addSpaceBeforeCharacter(IoStatementState &io) {
   if (auto *list{io.get_if<ListDirectedStatementState<Direction::Output>>()}) {
     list->set_lastWasUndelimitedCharacter(false);
   }
@@ -26,8 +27,8 @@ static void addSpaceBeforeCharacter(IoStatementState &io) {
 // representation of what is interpreted to be a single unsigned integer value.
 // When used with character data, endianness is exposed.
 template <int LOG2_BASE>
-static bool EditBOZOutput(IoStatementState &io, const DataEdit &edit,
-    const unsigned char *data0, std::size_t bytes) {
+static RT_API_ATTRS bool EditBOZOutput(IoStatementState &io,
+    const DataEdit &edit, const unsigned char *data0, std::size_t bytes) {
   addSpaceBeforeCharacter(io);
   int digits{static_cast<int>((bytes * 8) / LOG2_BASE)};
   int get{static_cast<int>(bytes * 8) - digits * LOG2_BASE};
@@ -107,7 +108,7 @@ static bool EditBOZOutput(IoStatementState &io, const DataEdit &edit,
 }
 
 template <int KIND>
-bool EditIntegerOutput(IoStatementState &io, const DataEdit &edit,
+bool RT_API_ATTRS EditIntegerOutput(IoStatementState &io, const DataEdit &edit,
     common::HostSignedIntType<8 * KIND> n) {
   addSpaceBeforeCharacter(io);
   char buffer[130], *end{&buffer[sizeof buffer]}, *p{end};
@@ -187,7 +188,7 @@ bool EditIntegerOutput(IoStatementState &io, const DataEdit &edit,
 }
 
 // Formats the exponent (see table 13.1 for all the cases)
-const char *RealOutputEditingBase::FormatExponent(
+RT_API_ATTRS const char *RealOutputEditingBase::FormatExponent(
     int expo, const DataEdit &edit, int &length) {
   char *eEnd{&exponent_[sizeof exponent_]};
   char *exponent{eEnd};
@@ -226,7 +227,7 @@ const char *RealOutputEditingBase::FormatExponent(
   return overflow ? nullptr : exponent;
 }
 
-bool RealOutputEditingBase::EmitPrefix(
+RT_API_ATTRS bool RealOutputEditingBase::EmitPrefix(
     const DataEdit &edit, std::size_t length, std::size_t width) {
   if (edit.IsListDirected()) {
     int prefixLength{edit.descriptor == DataEdit::ListDirectedRealPart ? 2
@@ -247,7 +248,7 @@ bool RealOutputEditingBase::EmitPrefix(
   }
 }
 
-bool RealOutputEditingBase::EmitSuffix(const DataEdit &edit) {
+RT_API_ATTRS bool RealOutputEditingBase::EmitSuffix(const DataEdit &edit) {
   if (edit.descriptor == DataEdit::ListDirectedRealPart) {
     return EmitAscii(
         io_, edit.modes.editingFlags & decimalComma ? ";" : ",", 1);
@@ -259,8 +260,10 @@ bool RealOutputEditingBase::EmitSuffix(const DataEdit &edit) {
 }
 
 template <int KIND>
-decimal::ConversionToDecimalResult RealOutputEditing<KIND>::ConvertToDecimal(
+RT_API_ATTRS decimal::ConversionToDecimalResult
+RealOutputEditing<KIND>::ConvertToDecimal(
     int significantDigits, enum decimal::FortranRounding rounding, int flags) {
+#if !defined(RT_DEVICE_COMPILATION)
   auto converted{decimal::ConvertToDecimal<binaryPrecision>(buffer_,
       sizeof buffer_, static_cast<enum decimal::DecimalConversionFlags>(flags),
       significantDigits, rounding, x_)};
@@ -270,9 +273,13 @@ decimal::ConversionToDecimalResult RealOutputEditing<KIND>::ConvertToDecimal(
         sizeof buffer_);
   }
   return converted;
+#else // defined(RT_DEVICE_COMPILATION)
+  // TODO: enable Decimal library build for the device.
+  io_.GetIoErrorHandler().Crash("not implemented yet: decimal conversion");
+#endif // defined(RT_DEVICE_COMPILATION)
 }
 
-static bool IsInfOrNaN(const char *p, int length) {
+static RT_API_ATTRS bool IsInfOrNaN(const char *p, int length) {
   if (!p || length < 1) {
     return false;
   }
@@ -287,7 +294,8 @@ static bool IsInfOrNaN(const char *p, int length) {
 
 // 13.7.2.3.3 in F'2018
 template <int KIND>
-bool RealOutputEditing<KIND>::EditEorDOutput(const DataEdit &edit) {
+RT_API_ATTRS bool RealOutputEditing<KIND>::EditEorDOutput(
+    const DataEdit &edit) {
   addSpaceBeforeCharacter(io_);
   int editDigits{edit.digits.value_or(0)}; // 'd' field
   int editWidth{edit.width.value_or(0)}; // 'w' field
@@ -423,7 +431,7 @@ bool RealOutputEditing<KIND>::EditEorDOutput(const DataEdit &edit) {
 
 // 13.7.2.3.2 in F'2018
 template <int KIND>
-bool RealOutputEditing<KIND>::EditFOutput(const DataEdit &edit) {
+RT_API_ATTRS bool RealOutputEditing<KIND>::EditFOutput(const DataEdit &edit) {
   addSpaceBeforeCharacter(io_);
   int fracDigits{edit.digits.value_or(0)}; // 'd' field
   const int editWidth{edit.width.value_or(0)}; // 'w' field
@@ -553,12 +561,12 @@ bool RealOutputEditing<KIND>::EditFOutput(const DataEdit &edit) {
 
 // 13.7.5.2.3 in F'2018
 template <int KIND>
-DataEdit RealOutputEditing<KIND>::EditForGOutput(DataEdit edit) {
+RT_API_ATTRS DataEdit RealOutputEditing<KIND>::EditForGOutput(DataEdit edit) {
   edit.descriptor = 'E';
   edit.variation = 'G'; // to suppress error for Ew.0
   int editWidth{edit.width.value_or(0)};
-  int significantDigits{
-      edit.digits.value_or(BinaryFloatingPoint::decimalPrecision)}; // 'd'
+  int significantDigits{edit.digits.value_or(
+      static_cast<int>(BinaryFloatingPoint::decimalPrecision))}; // 'd'
   if (editWidth > 0 && significantDigits == 0) {
     return edit; // Gw.0Ee -> Ew.0Ee for w > 0
   }
@@ -597,7 +605,8 @@ DataEdit RealOutputEditing<KIND>::EditForGOutput(DataEdit edit) {
 
 // 13.10.4 in F'2018
 template <int KIND>
-bool RealOutputEditing<KIND>::EditListDirectedOutput(const DataEdit &edit) {
+RT_API_ATTRS bool RealOutputEditing<KIND>::EditListDirectedOutput(
+    const DataEdit &edit) {
   decimal::ConversionToDecimalResult converted{
       ConvertToDecimal(1, edit.modes.round)};
   if (IsInfOrNaN(converted.str, static_cast<int>(converted.length))) {
@@ -631,7 +640,7 @@ bool RealOutputEditing<KIND>::EditListDirectedOutput(const DataEdit &edit) {
 // E.g., 2. is edited into 0X8.0P-2 rather than 0X2.0P0.  This implementation
 // follows that precedent so as to avoid a gratuitous incompatibility.
 template <int KIND>
-auto RealOutputEditing<KIND>::ConvertToHexadecimal(
+RT_API_ATTRS auto RealOutputEditing<KIND>::ConvertToHexadecimal(
     int significantDigits, enum decimal::FortranRounding rounding, int flags)
     -> ConvertToHexadecimalResult {
   if (x_.IsNaN() || x_.IsInfinite()) {
@@ -689,7 +698,7 @@ auto RealOutputEditing<KIND>::ConvertToHexadecimal(
 }
 
 template <int KIND>
-bool RealOutputEditing<KIND>::EditEXOutput(const DataEdit &edit) {
+RT_API_ATTRS bool RealOutputEditing<KIND>::EditEXOutput(const DataEdit &edit) {
   addSpaceBeforeCharacter(io_);
   int editDigits{edit.digits.value_or(0)}; // 'd' field
   int significantDigits{editDigits + 1};
@@ -740,7 +749,8 @@ bool RealOutputEditing<KIND>::EditEXOutput(const DataEdit &edit) {
           EmitAscii(io_, exponent, expoLength);
 }
 
-template <int KIND> bool RealOutputEditing<KIND>::Edit(const DataEdit &edit) {
+template <int KIND>
+RT_API_ATTRS bool RealOutputEditing<KIND>::Edit(const DataEdit &edit) {
   switch (edit.descriptor) {
   case 'D':
     return EditEorDOutput(edit);
@@ -783,13 +793,14 @@ template <int KIND> bool RealOutputEditing<KIND>::Edit(const DataEdit &edit) {
   return false;
 }
 
-bool ListDirectedLogicalOutput(IoStatementState &io,
+RT_API_ATTRS bool ListDirectedLogicalOutput(IoStatementState &io,
     ListDirectedStatementState<Direction::Output> &list, bool truth) {
   return list.EmitLeadingSpaceOrAdvance(io) &&
       EmitAscii(io, truth ? "T" : "F", 1);
 }
 
-bool EditLogicalOutput(IoStatementState &io, const DataEdit &edit, bool truth) {
+RT_API_ATTRS bool EditLogicalOutput(
+    IoStatementState &io, const DataEdit &edit, bool truth) {
   switch (edit.descriptor) {
   case 'L':
   case 'G':
@@ -813,7 +824,7 @@ bool EditLogicalOutput(IoStatementState &io, const DataEdit &edit, bool truth) {
 }
 
 template <typename CHAR>
-bool ListDirectedCharacterOutput(IoStatementState &io,
+RT_API_ATTRS bool ListDirectedCharacterOutput(IoStatementState &io,
     ListDirectedStatementState<Direction::Output> &list, const CHAR *x,
     std::size_t length) {
   bool ok{true};
@@ -870,8 +881,8 @@ bool ListDirectedCharacterOutput(IoStatementState &io,
 }
 
 template <typename CHAR>
-bool EditCharacterOutput(IoStatementState &io, const DataEdit &edit,
-    const CHAR *x, std::size_t length) {
+RT_API_ATTRS bool EditCharacterOutput(IoStatementState &io,
+    const DataEdit &edit, const CHAR *x, std::size_t length) {
   int len{static_cast<int>(length)};
   int width{edit.width.value_or(len)};
   switch (edit.descriptor) {
@@ -903,15 +914,15 @@ bool EditCharacterOutput(IoStatementState &io, const DataEdit &edit,
       EmitEncoded(io, x, std::min(width, len));
 }
 
-template bool EditIntegerOutput<1>(
+template RT_API_ATTRS bool EditIntegerOutput<1>(
     IoStatementState &, const DataEdit &, std::int8_t);
-template bool EditIntegerOutput<2>(
+template RT_API_ATTRS bool EditIntegerOutput<2>(
     IoStatementState &, const DataEdit &, std::int16_t);
-template bool EditIntegerOutput<4>(
+template RT_API_ATTRS bool EditIntegerOutput<4>(
     IoStatementState &, const DataEdit &, std::int32_t);
-template bool EditIntegerOutput<8>(
+template RT_API_ATTRS bool EditIntegerOutput<8>(
     IoStatementState &, const DataEdit &, std::int64_t);
-template bool EditIntegerOutput<16>(
+template RT_API_ATTRS bool EditIntegerOutput<16>(
     IoStatementState &, const DataEdit &, common::int128_t);
 
 template class RealOutputEditing<2>;
@@ -922,21 +933,22 @@ template class RealOutputEditing<10>;
 // TODO: double/double
 template class RealOutputEditing<16>;
 
-template bool ListDirectedCharacterOutput(IoStatementState &,
+template RT_API_ATTRS bool ListDirectedCharacterOutput(IoStatementState &,
     ListDirectedStatementState<Direction::Output> &, const char *,
     std::size_t chars);
-template bool ListDirectedCharacterOutput(IoStatementState &,
+template RT_API_ATTRS bool ListDirectedCharacterOutput(IoStatementState &,
     ListDirectedStatementState<Direction::Output> &, const char16_t *,
     std::size_t chars);
-template bool ListDirectedCharacterOutput(IoStatementState &,
+template RT_API_ATTRS bool ListDirectedCharacterOutput(IoStatementState &,
     ListDirectedStatementState<Direction::Output> &, const char32_t *,
     std::size_t chars);
 
-template bool EditCharacterOutput(
+template RT_API_ATTRS bool EditCharacterOutput(
     IoStatementState &, const DataEdit &, const char *, std::size_t chars);
-template bool EditCharacterOutput(
+template RT_API_ATTRS bool EditCharacterOutput(
     IoStatementState &, const DataEdit &, const char16_t *, std::size_t chars);
-template bool EditCharacterOutput(
+template RT_API_ATTRS bool EditCharacterOutput(
     IoStatementState &, const DataEdit &, const char32_t *, std::size_t chars);
 
+RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime::io
diff --git a/flang/runtime/edit-output.h b/flang/runtime/edit-output.h
index 4e6d6b25b4dd2d..365bc2e2a4d10b 100644
--- a/flang/runtime/edit-output.h
+++ b/flang/runtime/edit-output.h
@@ -30,18 +30,20 @@ namespace Fortran::runtime::io {
 // one edit descriptor with a repeat factor may safely serve to edit
 // multiple elements of an array.
 template <int KIND>
-bool EditIntegerOutput(
+RT_API_ATTRS bool EditIntegerOutput(
     IoStatementState &, const DataEdit &, common::HostSignedIntType<8 * KIND>);
 
 // Encapsulates the state of a REAL output conversion.
 class RealOutputEditingBase {
 protected:
-  explicit RealOutputEditingBase(IoStatementState &io) : io_{io} {}
+  explicit RT_API_ATTRS RealOutputEditingBase(IoStatementState &io) : io_{io} {}
 
   // Returns null when the exponent overflows a fixed-size output field.
-  const char *FormatExponent(int, const DataEdit &edit, int &length);
-  bool EmitPrefix(const DataEdit &, std::size_t length, std::size_t width);
-  bool EmitSuffix(const DataEdit &);
+  RT_API_ATTRS const char *FormatExponent(
+      int, const DataEdit &edit, int &length);
+  RT_API_ATTRS bool EmitPrefix(
+      const DataEdit &, std::size_t length, std::size_t width);
+  RT_API_ATTRS bool EmitSuffix(const DataEdit &);
 
   IoStatementState &io_;
   int trailingBlanks_{0}; // created when Gw editing maps to Fw
@@ -50,27 +52,29 @@ class RealOutputEditingBase {
 
 template <int KIND> class RealOutputEditing : public RealOutputEditingBase {
 public:
+  RT_VAR_GROUP_BEGIN
   static constexpr int binaryPrecision{common::PrecisionOfRealKind(KIND)};
+  RT_VAR_GROUP_END
   using BinaryFloatingPoint =
       decimal::BinaryFloatingPointNumber<binaryPrecision>;
   template <typename A>
-  RealOutputEditing(IoStatementState &io, A x)
+  RT_API_ATTRS RealOutputEditing(IoStatementState &io, A x)
       : RealOutputEditingBase{io}, x_{x} {}
-  bool Edit(const DataEdit &);
+  RT_API_ATTRS bool Edit(const DataEdit &);
 
 private:
   // The DataEdit arguments here are const references or copies so that
   // the original DataEdit can safely serve multiple array elements when
   // it has a repeat count.
-  bool EditEorDOutput(const DataEdit &);
-  bool EditFOutput(const DataEdit &);
-  DataEdit EditForGOutput(DataEdit); // returns an E or F edit
-  bool EditEXOutput(const DataEdit &);
-  bool EditListDirectedOutput(const DataEdit &);
+  RT_API_ATTRS bool EditEorDOutput(const DataEdit &);
+  RT_API_ATTRS bool EditFOutput(const DataEdit &);
+  RT_API_ATTRS DataEdit EditForGOutput(DataEdit); // returns an E or F edit
+  RT_API_ATTRS bool EditEXOutput(const DataEdit &);
+  RT_API_ATTRS bool EditListDirectedOutput(const DataEdit &);
 
-  bool IsZero() const { return x_.IsZero(); }
+  RT_API_ATTRS bool IsZero() const { return x_.IsZero(); }
 
-  decimal::ConversionToDecimalResult ConvertToDecimal(
+  RT_API_ATTRS decimal::ConversionToDecimalResult ConvertToDecimal(
       int significantDigits, enum decimal::FortranRounding, int flags = 0);
 
   struct ConvertToHexadecimalResult {
@@ -78,7 +82,7 @@ template <int KIND> class RealOutputEditing : public RealOutputEditingBase {
     int length;
     int exponent;
   };
-  ConvertToHexadecimalResult ConvertToHexadecimal(
+  RT_API_ATTRS ConvertToHexadecimalResult ConvertToHexadecimal(
       int significantDigits, enum decimal::FortranRounding, int flags = 0);
 
   BinaryFloatingPoint x_;
@@ -86,43 +90,43 @@ template <int KIND> class RealOutputEditing : public RealOutputEditingBase {
       EXTRA_DECIMAL_CONVERSION_SPACE];
 };
 
-bool ListDirectedLogicalOutput(
+RT_API_ATTRS bool ListDirectedLogicalOutput(
     IoStatementState &, ListDirectedStatementState<Direction::Output> &, bool);
-bool EditLogicalOutput(IoStatementState &, const DataEdit &, bool);
+RT_API_ATTRS bool EditLogicalOutput(IoStatementState &, const DataEdit &, bool);
 
 template <typename CHAR>
-bool ListDirectedCharacterOutput(IoStatementState &,
+RT_API_ATTRS bool ListDirectedCharacterOutput(IoStatementState &,
     ListDirectedStatementState<Direction::Output> &, const CHAR *,
     std::size_t chars);
-extern template bool ListDirectedCharacterOutput(IoStatementState &,
-    ListDirectedStatementState<Direction::Output> &, const char *,
-    std::size_t chars);
-extern template bool ListDirectedCharacterOutput(IoStatementState &,
-    ListDirectedStatementState<Direction::Output> &, const char16_t *,
-    std::size_t chars);
-extern template bool ListDirectedCharacterOutput(IoStatementState &,
-    ListDirectedStatementState<Direction::Output> &, const char32_t *,
-    std::size_t chars);
+extern template RT_API_ATTRS bool ListDirectedCharacterOutput(
+    IoStatementState &, ListDirectedStatementState<Direction::Output> &,
+    const char *, std::size_t chars);
+extern template RT_API_ATTRS bool ListDirectedCharacterOutput(
+    IoStatementState &, ListDirectedStatementState<Direction::Output> &,
+    const char16_t *, std::size_t chars);
+extern template RT_API_ATTRS bool ListDirectedCharacterOutput(
+    IoStatementState &, ListDirectedStatementState<Direction::Output> &,
+    const char32_t *, std::size_t chars);
 
 template <typename CHAR>
-bool EditCharacterOutput(
+RT_API_ATTRS bool EditCharacterOutput(
     IoStatementState &, const DataEdit &, const CHAR *, std::size_t chars);
-extern template bool EditCharacterOutput(
+extern template RT_API_ATTRS bool EditCharacterOutput(
     IoStatementState &, const DataEdit &, const char *, std::size_t chars);
-extern template bool EditCharacterOutput(
+extern template RT_API_ATTRS bool EditCharacterOutput(
     IoStatementState &, const DataEdit &, const char16_t *, std::size_t chars);
-extern template bool EditCharacterOutput(
+extern template RT_API_ATTRS bool EditCharacterOutput(
     IoStatementState &, const DataEdit &, const char32_t *, std::size_t chars);
 
-extern template bool EditIntegerOutput<1>(
+extern template RT_API_ATTRS bool EditIntegerOutput<1>(
     IoStatementState &, const DataEdit &, std::int8_t);
-extern template bool EditIntegerOutput<2>(
+extern template RT_API_ATTRS bool EditIntegerOutput<2>(
     IoStatementState &, const DataEdit &, std::int16_t);
-extern template bool EditIntegerOutput<4>(
+extern template RT_API_ATTRS bool EditIntegerOutput<4>(
     IoStatementState &, const DataEdit &, std::int32_t);
-extern template bool EditIntegerOutput<8>(
+extern template RT_API_ATTRS bool EditIntegerOutput<8>(
     IoStatementState &, const DataEdit &, std::int64_t);
-extern template bool EditIntegerOutput<16>(
+extern template RT_API_ATTRS bool EditIntegerOutput<16>(
     IoStatementState &, const DataEdit &, common::int128_t);
 
 extern template class RealOutputEditing<2>;
diff --git a/flang/runtime/emit-encoded.h b/flang/runtime/emit-encoded.h
index 864848c3b19c67..ac8c7d758a0d00 100644
--- a/flang/runtime/emit-encoded.h
+++ b/flang/runtime/emit-encoded.h
@@ -19,7 +19,8 @@
 namespace Fortran::runtime::io {
 
 template <typename CONTEXT, typename CHAR>
-bool EmitEncoded(CONTEXT &to, const CHAR *data, std::size_t chars) {
+RT_API_ATTRS bool EmitEncoded(
+    CONTEXT &to, const CHAR *data, std::size_t chars) {
   ConnectionState &connection{to.GetConnectionState()};
   if (connection.access == Access::Stream &&
       connection.internalIoCharKind == 0) {
@@ -74,7 +75,7 @@ bool EmitEncoded(CONTEXT &to, const CHAR *data, std::size_t chars) {
 }
 
 template <typename CONTEXT>
-bool EmitAscii(CONTEXT &to, const char *data, std::size_t chars) {
+RT_API_ATTRS bool EmitAscii(CONTEXT &to, const char *data, std::size_t chars) {
   ConnectionState &connection{to.GetConnectionState()};
   if (connection.internalIoCharKind <= 1 &&
       connection.access != Access::Stream) {
@@ -85,7 +86,7 @@ bool EmitAscii(CONTEXT &to, const char *data, std::size_t chars) {
 }
 
 template <typename CONTEXT>
-bool EmitRepeated(CONTEXT &to, char ch, std::size_t n) {
+RT_API_ATTRS bool EmitRepeated(CONTEXT &to, char ch, std::size_t n) {
   if (n <= 0) {
     return true;
   }
diff --git a/flang/runtime/environment.h b/flang/runtime/environment.h
index 9bc1158509615f..6c56993fb1d6ec 100644
--- a/flang/runtime/environment.h
+++ b/flang/runtime/environment.h
@@ -18,6 +18,7 @@ namespace Fortran::runtime {
 
 class Terminator;
 
+RT_OFFLOAD_VAR_GROUP_BEGIN
 #if FLANG_BIG_ENDIAN
 constexpr bool isHostLittleEndian{false};
 #elif FLANG_LITTLE_ENDIAN
@@ -25,6 +26,7 @@ constexpr bool isHostLittleEndian{true};
 #else
 #error host endianness is not known
 #endif
+RT_OFFLOAD_VAR_GROUP_END
 
 // External unformatted I/O data conversions
 enum class Convert { Unknown, Native, LittleEndian, BigEndian, Swap };
diff --git a/flang/runtime/external-unit.cpp b/flang/runtime/external-unit.cpp
index 9d650ceca4a8cc..b48549d54587eb 100644
--- a/flang/runtime/external-unit.cpp
+++ b/flang/runtime/external-unit.cpp
@@ -10,14 +10,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "tools.h"
-
-#if !defined(RT_USE_PSEUDO_FILE_UNIT)
-
 #include "io-error.h"
 #include "lock.h"
+#include "tools.h"
 #include "unit-map.h"
 #include "unit.h"
+
+// NOTE: the header files above may define OpenMP declare target
+// variables, so they have to be included unconditionally
+// so that the offload entries are consistent between host and device.
+#if !defined(RT_USE_PSEUDO_FILE_UNIT)
+
 #include <cstdio>
 #include <limits>
 
diff --git a/flang/runtime/file.cpp b/flang/runtime/file.cpp
index 6ca5776f812a04..67764f1f562624 100644
--- a/flang/runtime/file.cpp
+++ b/flang/runtime/file.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "file.h"
+#include "tools.h"
 #include "flang/Runtime/magic-numbers.h"
 #include "flang/Runtime/memory.h"
 #include <algorithm>
@@ -424,6 +425,7 @@ void OpenFile::CloseFd(IoErrorHandler &handler) {
   }
 }
 
+#if !defined(RT_DEVICE_COMPILATION)
 bool IsATerminal(int fd) { return ::isatty(fd); }
 
 #if defined(_WIN32) && !defined(F_OK)
@@ -455,5 +457,25 @@ std::int64_t SizeInBytes(const char *path) {
   // No Fortran compiler signals an error
   return -1;
 }
+#else // defined(RT_DEVICE_COMPILATION)
+bool IsATerminal(int fd) {
+  Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+bool IsExtant(const char *path) {
+  Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+bool MayRead(const char *path) {
+  Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+bool MayWrite(const char *path) {
+  Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+bool MayReadAndWrite(const char *path) {
+  Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+std::int64_t SizeInBytes(const char *path) {
+  Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+#endif // defined(RT_DEVICE_COMPILATION)
 
 } // namespace Fortran::runtime::io
diff --git a/flang/runtime/file.h b/flang/runtime/file.h
index 17deeb0e2f8270..c06acbb9904cc1 100644
--- a/flang/runtime/file.h
+++ b/flang/runtime/file.h
@@ -106,11 +106,11 @@ class OpenFile {
   OwningPtr<Pending> pending_;
 };
 
-bool IsATerminal(int fd);
-bool IsExtant(const char *path);
-bool MayRead(const char *path);
-bool MayWrite(const char *path);
-bool MayReadAndWrite(const char *path);
-std::int64_t SizeInBytes(const char *path);
+RT_API_ATTRS bool IsATerminal(int fd);
+RT_API_ATTRS bool IsExtant(const char *path);
+RT_API_ATTRS bool MayRead(const char *path);
+RT_API_ATTRS bool MayWrite(const char *path);
+RT_API_ATTRS bool MayReadAndWrite(const char *path);
+RT_API_ATTRS std::int64_t SizeInBytes(const char *path);
 } // namespace Fortran::runtime::io
 #endif // FORTRAN_RUNTIME_FILE_H_
diff --git a/flang/runtime/format-implementation.h b/flang/runtime/format-implementation.h
index b84e3208271b75..45d4bd641f6f66 100644
--- a/flang/runtime/format-implementation.h
+++ b/flang/runtime/format-implementation.h
@@ -25,7 +25,7 @@
 namespace Fortran::runtime::io {
 
 template <typename CONTEXT>
-FormatControl<CONTEXT>::FormatControl(const Terminator &terminator,
+RT_API_ATTRS FormatControl<CONTEXT>::FormatControl(const Terminator &terminator,
     const CharType *format, std::size_t formatLength,
     const Descriptor *formatDescriptor, int maxHeight)
     : maxHeight_{static_cast<std::uint8_t>(maxHeight)}, format_{format},
@@ -63,7 +63,7 @@ FormatControl<CONTEXT>::FormatControl(const Terminator &terminator,
 }
 
 template <typename CONTEXT>
-int FormatControl<CONTEXT>::GetIntField(
+RT_API_ATTRS int FormatControl<CONTEXT>::GetIntField(
     IoErrorHandler &handler, CharType firstCh, bool *hadError) {
   CharType ch{firstCh ? firstCh : PeekNext()};
   bool negate{ch == '-'};
@@ -114,7 +114,8 @@ int FormatControl<CONTEXT>::GetIntField(
 }
 
 template <typename CONTEXT>
-static void HandleControl(CONTEXT &context, char ch, char next, int n) {
+static RT_API_ATTRS void HandleControl(
+    CONTEXT &context, char ch, char next, int n) {
   MutableModes &modes{context.mutableModes()};
   switch (ch) {
   case 'B':
@@ -221,7 +222,8 @@ static void HandleControl(CONTEXT &context, char ch, char next, int n) {
 // Generally assumes that the format string has survived the common
 // format validator gauntlet.
 template <typename CONTEXT>
-int FormatControl<CONTEXT>::CueUpNextDataEdit(Context &context, bool stop) {
+RT_API_ATTRS int FormatControl<CONTEXT>::CueUpNextDataEdit(
+    Context &context, bool stop) {
   bool hitUnlimitedLoopEnd{false};
   // Do repetitions remain on an unparenthesized data edit?
   while (height_ > 1 && format_[stack_[height_ - 1].start] != '(') {
@@ -419,8 +421,8 @@ int FormatControl<CONTEXT>::CueUpNextDataEdit(Context &context, bool stop) {
 
 // Returns the next data edit descriptor
 template <typename CONTEXT>
-Fortran::common::optional<DataEdit> FormatControl<CONTEXT>::GetNextDataEdit(
-    Context &context, int maxRepeat) {
+RT_API_ATTRS Fortran::common::optional<DataEdit>
+FormatControl<CONTEXT>::GetNextDataEdit(Context &context, int maxRepeat) {
   int repeat{CueUpNextDataEdit(context)};
   auto start{offset_};
   DataEdit edit;
@@ -524,7 +526,7 @@ Fortran::common::optional<DataEdit> FormatControl<CONTEXT>::GetNextDataEdit(
 }
 
 template <typename CONTEXT>
-void FormatControl<CONTEXT>::Finish(Context &context) {
+RT_API_ATTRS void FormatControl<CONTEXT>::Finish(Context &context) {
   CueUpNextDataEdit(context, true /* stop at colon or end of FORMAT */);
   if (freeFormat_) {
     FreeMemory(const_cast<CharType *>(format_));
diff --git a/flang/runtime/format.cpp b/flang/runtime/format.cpp
index f219c29aaed142..433acce4b73739 100644
--- a/flang/runtime/format.cpp
+++ b/flang/runtime/format.cpp
@@ -9,6 +9,7 @@
 #include "format-implementation.h"
 
 namespace Fortran::runtime::io {
+RT_OFFLOAD_API_GROUP_BEGIN
 template class FormatControl<
     InternalFormattedIoStatementState<Direction::Output>>;
 template class FormatControl<
@@ -19,4 +20,5 @@ template class FormatControl<
     ExternalFormattedIoStatementState<Direction::Input>>;
 template class FormatControl<ChildFormattedIoStatementState<Direction::Output>>;
 template class FormatControl<ChildFormattedIoStatementState<Direction::Input>>;
+RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime::io
diff --git a/flang/runtime/format.h b/flang/runtime/format.h
index e7d94559964041..f57cf920448712 100644
--- a/flang/runtime/format.h
+++ b/flang/runtime/format.h
@@ -12,6 +12,7 @@
 #define FORTRAN_RUNTIME_FORMAT_H_
 
 #include "environment.h"
+#include "freestanding-tools.h"
 #include "io-error.h"
 #include "flang/Common/Fortran.h"
 #include "flang/Common/optional.h"
@@ -49,20 +50,21 @@ struct DataEdit {
   char descriptor; // capitalized: one of A, I, B, O, Z, F, E(N/S/X), D, G
 
   // Special internal data edit descriptors for list-directed & NAMELIST I/O
+  RT_OFFLOAD_VAR_GROUP_BEGIN
   static constexpr char ListDirected{'g'}; // non-COMPLEX list-directed
   static constexpr char ListDirectedRealPart{'r'}; // emit "(r," or "(r;"
   static constexpr char ListDirectedImaginaryPart{'z'}; // emit "z)"
   static constexpr char ListDirectedNullValue{'n'}; // see 13.10.3.2
-  constexpr bool IsListDirected() const {
+  static constexpr char DefinedDerivedType{'d'}; // DT defined I/O
+  RT_OFFLOAD_VAR_GROUP_END
+  constexpr RT_API_ATTRS bool IsListDirected() const {
     return descriptor == ListDirected || descriptor == ListDirectedRealPart ||
         descriptor == ListDirectedImaginaryPart;
   }
-  constexpr bool IsNamelist() const {
+  constexpr RT_API_ATTRS bool IsNamelist() const {
     return IsListDirected() && modes.inNamelist;
   }
 
-  static constexpr char DefinedDerivedType{'d'}; // DT defined I/O
-
   char variation{'\0'}; // N, S, or X for EN, ES, EX; G/l for original G/list
   Fortran::common::optional<int> width; // the 'w' field; optional for A
   Fortran::common::optional<int> digits; // the 'm' or 'd' field
@@ -72,8 +74,10 @@ struct DataEdit {
 
   // "iotype" &/or "v_list" values for a DT'iotype'(v_list)
   // defined I/O data edit descriptor
+  RT_OFFLOAD_VAR_GROUP_BEGIN
   static constexpr std::size_t maxIoTypeChars{32};
   static constexpr std::size_t maxVListEntries{4};
+  RT_OFFLOAD_VAR_GROUP_END
   std::uint8_t ioTypeChars{0};
   std::uint8_t vListEntries{0};
   char ioType[maxIoTypeChars];
@@ -88,13 +92,13 @@ template <typename CONTEXT> class FormatControl {
   using Context = CONTEXT;
   using CharType = char; // formats are always default kind CHARACTER
 
-  FormatControl() {}
-  FormatControl(const Terminator &, const CharType *format,
+  RT_API_ATTRS FormatControl() {}
+  RT_API_ATTRS FormatControl(const Terminator &, const CharType *format,
       std::size_t formatLength, const Descriptor *formatDescriptor = nullptr,
       int maxHeight = maxMaxHeight);
 
   // For attempting to allocate in a user-supplied stack area
-  static std::size_t GetNeededSize(int maxHeight) {
+  static RT_API_ATTRS std::size_t GetNeededSize(int maxHeight) {
     return sizeof(FormatControl) -
         sizeof(Iteration) * (maxMaxHeight - maxHeight);
   }
@@ -102,14 +106,15 @@ template <typename CONTEXT> class FormatControl {
   // Extracts the next data edit descriptor, handling control edit descriptors
   // along the way.  If maxRepeat==0, this is a peek at the next data edit
   // descriptor.
-  Fortran::common::optional<DataEdit> GetNextDataEdit(
+  RT_API_ATTRS Fortran::common::optional<DataEdit> GetNextDataEdit(
       Context &, int maxRepeat = 1);
 
   // Emit any remaining character literals after the last data item (on output)
   // and perform remaining record positioning actions.
-  void Finish(Context &);
+  RT_API_ATTRS void Finish(Context &);
 
 private:
+  RT_OFFLOAD_VAR_GROUP_BEGIN
   static constexpr std::uint8_t maxMaxHeight{100};
 
   struct Iteration {
@@ -117,19 +122,20 @@ template <typename CONTEXT> class FormatControl {
     int start{0}; // offset in format_ of '(' or a repeated edit descriptor
     int remaining{0}; // while >0, decrement and iterate
   };
+  RT_OFFLOAD_VAR_GROUP_END
 
-  void SkipBlanks() {
+  RT_API_ATTRS void SkipBlanks() {
     while (offset_ < formatLength_ &&
         (format_[offset_] == ' ' || format_[offset_] == '\t' ||
             format_[offset_] == '\v')) {
       ++offset_;
     }
   }
-  CharType PeekNext() {
+  RT_API_ATTRS CharType PeekNext() {
     SkipBlanks();
     return offset_ < formatLength_ ? format_[offset_] : '\0';
   }
-  CharType GetNextChar(IoErrorHandler &handler) {
+  RT_API_ATTRS CharType GetNextChar(IoErrorHandler &handler) {
     SkipBlanks();
     if (offset_ >= formatLength_) {
       if (formatLength_ == 0) {
@@ -143,7 +149,7 @@ template <typename CONTEXT> class FormatControl {
     }
     return format_[offset_++];
   }
-  int GetIntField(
+  RT_API_ATTRS int GetIntField(
       IoErrorHandler &, CharType firstCh = '\0', bool *hadError = nullptr);
 
   // Advances through the FORMAT until the next data edit
@@ -151,13 +157,14 @@ template <typename CONTEXT> class FormatControl {
   // along the way.  Returns the repeat count that appeared
   // before the descriptor (defaulting to 1) and leaves offset_
   // pointing to the data edit.
-  int CueUpNextDataEdit(Context &, bool stop = false);
+  RT_API_ATTRS int CueUpNextDataEdit(Context &, bool stop = false);
 
-  static constexpr CharType Capitalize(CharType ch) {
+  static constexpr RT_API_ATTRS CharType Capitalize(CharType ch) {
     return ch >= 'a' && ch <= 'z' ? ch + 'A' - 'a' : ch;
   }
 
-  void ReportBadFormat(Context &context, const char *msg, int offset) const {
+  RT_API_ATTRS void ReportBadFormat(
+      Context &context, const char *msg, int offset) const {
     if constexpr (std::is_same_v<CharType, char>) {
       // Echo the bad format in the error message, but trim any leading or
       // trailing spaces.
diff --git a/flang/runtime/freestanding-tools.h b/flang/runtime/freestanding-tools.h
index 682b4c9b89294b..818a4dd53eb762 100644
--- a/flang/runtime/freestanding-tools.h
+++ b/flang/runtime/freestanding-tools.h
@@ -47,14 +47,19 @@
 #define STD_MEMCHR_UNSUPPORTED 1
 #endif
 
+#if !defined(STD_STRCPY_UNSUPPORTED) && \
+    (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define STD_STRCPY_UNSUPPORTED 1
+#endif
+
 namespace Fortran::runtime {
 
 #if STD_FILL_N_UNSUPPORTED
 // Provides alternative implementation for std::fill_n(), if
 // it is not supported.
-template <typename A>
-static inline RT_API_ATTRS void fill_n(
-    A *start, std::size_t count, const A &value) {
+template <typename A, typename B>
+static inline RT_API_ATTRS std::enable_if_t<std::is_convertible_v<B, A>, void>
+fill_n(A *start, std::size_t count, const B &value) {
   for (std::size_t j{0}; j < count; ++j) {
     start[j] = value;
   }
@@ -157,5 +162,19 @@ static inline RT_API_ATTRS const void *memchr(
 using std::memchr;
 #endif // !STD_MEMCMP_UNSUPPORTED
 
+#if STD_STRCPY_UNSUPPORTED
+// Provides alternative implementation for std::strcpy(), if
+// it is not supported.
+static inline RT_API_ATTRS char *strcpy(char *dest, const char *src) {
+  char *result{dest};
+  do {
+    *dest++ = *src;
+  } while (*src++ != '\0');
+  return result;
+}
+#else // !STD_STRCPY_UNSUPPORTED
+using std::strcpy;
+#endif // !STD_STRCPY_UNSUPPORTED
+
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_FREESTANDING_TOOLS_H_
diff --git a/flang/runtime/internal-unit.cpp b/flang/runtime/internal-unit.cpp
index 66140e00588723..35766306ccefbe 100644
--- a/flang/runtime/internal-unit.cpp
+++ b/flang/runtime/internal-unit.cpp
@@ -7,15 +7,17 @@
 //===----------------------------------------------------------------------===//
 
 #include "internal-unit.h"
+#include "freestanding-tools.h"
 #include "io-error.h"
 #include "flang/Runtime/descriptor.h"
 #include <algorithm>
 #include <type_traits>
 
 namespace Fortran::runtime::io {
+RT_OFFLOAD_API_GROUP_BEGIN
 
 template <Direction DIR>
-InternalDescriptorUnit<DIR>::InternalDescriptorUnit(
+RT_API_ATTRS InternalDescriptorUnit<DIR>::InternalDescriptorUnit(
     Scalar scalar, std::size_t length, int kind) {
   internalIoCharKind = kind;
   recordLength = length;
@@ -26,7 +28,7 @@ InternalDescriptorUnit<DIR>::InternalDescriptorUnit(
 }
 
 template <Direction DIR>
-InternalDescriptorUnit<DIR>::InternalDescriptorUnit(
+RT_API_ATTRS InternalDescriptorUnit<DIR>::InternalDescriptorUnit(
     const Descriptor &that, const Terminator &terminator) {
   auto thatType{that.type().GetCategoryAndKind()};
   RUNTIME_CHECK(terminator, thatType.has_value());
@@ -42,7 +44,7 @@ InternalDescriptorUnit<DIR>::InternalDescriptorUnit(
 }
 
 template <Direction DIR>
-bool InternalDescriptorUnit<DIR>::Emit(
+RT_API_ATTRS bool InternalDescriptorUnit<DIR>::Emit(
     const char *data, std::size_t bytes, IoErrorHandler &handler) {
   if constexpr (DIR == Direction::Input) {
     handler.Crash("InternalDescriptorUnit<Direction::Input>::Emit() called");
@@ -76,7 +78,7 @@ bool InternalDescriptorUnit<DIR>::Emit(
 }
 
 template <Direction DIR>
-std::size_t InternalDescriptorUnit<DIR>::GetNextInputBytes(
+RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::GetNextInputBytes(
     const char *&p, IoErrorHandler &handler) {
   if constexpr (DIR == Direction::Output) {
     handler.Crash("InternalDescriptorUnit<Direction::Output>::"
@@ -97,7 +99,8 @@ std::size_t InternalDescriptorUnit<DIR>::GetNextInputBytes(
 }
 
 template <Direction DIR>
-bool InternalDescriptorUnit<DIR>::AdvanceRecord(IoErrorHandler &handler) {
+RT_API_ATTRS bool InternalDescriptorUnit<DIR>::AdvanceRecord(
+    IoErrorHandler &handler) {
   if (currentRecordNumber >= endfileRecordNumber.value_or(0)) {
     if constexpr (DIR == Direction::Input) {
       handler.SignalEnd();
@@ -115,24 +118,25 @@ bool InternalDescriptorUnit<DIR>::AdvanceRecord(IoErrorHandler &handler) {
 }
 
 template <Direction DIR>
-void InternalDescriptorUnit<DIR>::BlankFill(char *at, std::size_t bytes) {
+RT_API_ATTRS void InternalDescriptorUnit<DIR>::BlankFill(
+    char *at, std::size_t bytes) {
   switch (internalIoCharKind) {
   case 2:
-    std::fill_n(reinterpret_cast<char16_t *>(at), bytes / 2,
+    Fortran::runtime::fill_n(reinterpret_cast<char16_t *>(at), bytes / 2,
         static_cast<char16_t>(' '));
     break;
   case 4:
-    std::fill_n(reinterpret_cast<char32_t *>(at), bytes / 4,
+    Fortran::runtime::fill_n(reinterpret_cast<char32_t *>(at), bytes / 4,
         static_cast<char32_t>(' '));
     break;
   default:
-    std::fill_n(at, bytes, ' ');
+    Fortran::runtime::fill_n(at, bytes, ' ');
     break;
   }
 }
 
 template <Direction DIR>
-void InternalDescriptorUnit<DIR>::BlankFillOutputRecord() {
+RT_API_ATTRS void InternalDescriptorUnit<DIR>::BlankFillOutputRecord() {
   if constexpr (DIR == Direction::Output) {
     if (furthestPositionInRecord <
         recordLength.value_or(furthestPositionInRecord)) {
@@ -143,18 +147,21 @@ void InternalDescriptorUnit<DIR>::BlankFillOutputRecord() {
 }
 
 template <Direction DIR>
-void InternalDescriptorUnit<DIR>::BackspaceRecord(IoErrorHandler &handler) {
+RT_API_ATTRS void InternalDescriptorUnit<DIR>::BackspaceRecord(
+    IoErrorHandler &handler) {
   RUNTIME_CHECK(handler, currentRecordNumber > 1);
   --currentRecordNumber;
   BeginRecord();
 }
 
 template <Direction DIR>
-std::int64_t InternalDescriptorUnit<DIR>::InquirePos() {
+RT_API_ATTRS std::int64_t InternalDescriptorUnit<DIR>::InquirePos() {
   return (currentRecordNumber - 1) * recordLength.value_or(0) +
       positionInRecord + 1;
 }
 
 template class InternalDescriptorUnit<Direction::Output>;
 template class InternalDescriptorUnit<Direction::Input>;
+
+RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime::io
diff --git a/flang/runtime/internal-unit.h b/flang/runtime/internal-unit.h
index b536ffb831d550..bcd38b62468af2 100644
--- a/flang/runtime/internal-unit.h
+++ b/flang/runtime/internal-unit.h
@@ -26,26 +26,28 @@ template <Direction DIR> class InternalDescriptorUnit : public ConnectionState {
 public:
   using Scalar =
       std::conditional_t<DIR == Direction::Input, const char *, char *>;
-  InternalDescriptorUnit(Scalar, std::size_t chars, int kind);
-  InternalDescriptorUnit(const Descriptor &, const Terminator &);
+  RT_API_ATTRS InternalDescriptorUnit(Scalar, std::size_t chars, int kind);
+  RT_API_ATTRS InternalDescriptorUnit(const Descriptor &, const Terminator &);
 
-  bool Emit(const char *, std::size_t, IoErrorHandler &);
-  std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
-  bool AdvanceRecord(IoErrorHandler &);
-  void BackspaceRecord(IoErrorHandler &);
-  std::int64_t InquirePos();
+  RT_API_ATTRS bool Emit(const char *, std::size_t, IoErrorHandler &);
+  RT_API_ATTRS std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
+  RT_API_ATTRS bool AdvanceRecord(IoErrorHandler &);
+  RT_API_ATTRS void BackspaceRecord(IoErrorHandler &);
+  RT_API_ATTRS std::int64_t InquirePos();
 
 private:
-  Descriptor &descriptor() { return staticDescriptor_.descriptor(); }
-  const Descriptor &descriptor() const {
+  RT_API_ATTRS Descriptor &descriptor() {
     return staticDescriptor_.descriptor();
   }
-  Scalar CurrentRecord() const {
+  RT_API_ATTRS const Descriptor &descriptor() const {
+    return staticDescriptor_.descriptor();
+  }
+  RT_API_ATTRS Scalar CurrentRecord() const {
     return descriptor().template ZeroBasedIndexedElement<char>(
         currentRecordNumber - 1);
   }
-  void BlankFill(char *, std::size_t);
-  void BlankFillOutputRecord();
+  RT_API_ATTRS void BlankFill(char *, std::size_t);
+  RT_API_ATTRS void BlankFillOutputRecord();
 
   StaticDescriptor<maxRank, true /*addendum*/> staticDescriptor_;
 };
diff --git a/flang/runtime/io-api.cpp b/flang/runtime/io-api.cpp
index 094db5572f15c2..0f259f4715bf21 100644
--- a/flang/runtime/io-api.cpp
+++ b/flang/runtime/io-api.cpp
@@ -99,7 +99,7 @@ Cookie IONAME(BeginInternalArrayFormattedInput)(const Descriptor &descriptor,
 }
 
 template <Direction DIR>
-Cookie BeginInternalListIO(
+RT_API_ATTRS Cookie BeginInternalListIO(
     std::conditional_t<DIR == Direction::Input, const char, char> *internal,
     std::size_t internalLength, void ** /*scratchArea*/,
     std::size_t /*scratchBytes*/, const char *sourceFile, int sourceLine) {
@@ -156,8 +156,8 @@ Cookie IONAME(BeginInternalFormattedInput)(const char *internal,
       sourceFile, sourceLine);
 }
 
-static Cookie NoopUnit(const Terminator &terminator, int unitNumber,
-    enum Iostat iostat = IostatOk) {
+static RT_API_ATTRS Cookie NoopUnit(const Terminator &terminator,
+    int unitNumber, enum Iostat iostat = IostatOk) {
   Cookie cookie{&New<NoopStatementState>{terminator}(
       terminator.sourceFileName(), terminator.sourceLine(), unitNumber)
                      .release()
@@ -168,9 +168,9 @@ static Cookie NoopUnit(const Terminator &terminator, int unitNumber,
   return cookie;
 }
 
-static ExternalFileUnit *GetOrCreateUnit(int unitNumber, Direction direction,
-    Fortran::common::optional<bool> isUnformatted, const Terminator &terminator,
-    Cookie &errorCookie) {
+static RT_API_ATTRS ExternalFileUnit *GetOrCreateUnit(int unitNumber,
+    Direction direction, Fortran::common::optional<bool> isUnformatted,
+    const Terminator &terminator, Cookie &errorCookie) {
   if (ExternalFileUnit *
       unit{ExternalFileUnit::LookUpOrCreateAnonymous(
           unitNumber, direction, isUnformatted, terminator)}) {
@@ -183,7 +183,7 @@ static ExternalFileUnit *GetOrCreateUnit(int unitNumber, Direction direction,
 }
 
 template <Direction DIR, template <Direction> class STATE, typename... A>
-Cookie BeginExternalListIO(
+RT_API_ATTRS Cookie BeginExternalListIO(
     int unitNumber, const char *sourceFile, int sourceLine, A &&...xs) {
   Terminator terminator{sourceFile, sourceLine};
   Cookie errorCookie{nullptr};
@@ -227,11 +227,13 @@ Cookie BeginExternalListIO(
   }
 }
 
-Cookie IONAME(BeginExternalListOutput)(
+RT_EXT_API_GROUP_BEGIN
+Cookie IODEF(BeginExternalListOutput)(
     ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
   return BeginExternalListIO<Direction::Output, ExternalListIoStatementState>(
       unitNumber, sourceFile, sourceLine);
 }
+RT_EXT_API_GROUP_END
 
 Cookie IONAME(BeginExternalListInput)(
     ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
@@ -1163,7 +1165,8 @@ bool IONAME(OutputInteger16)(Cookie cookie, std::int16_t n) {
   return descr::DescriptorIO<Direction::Output>(*cookie, descriptor);
 }
 
-bool IONAME(OutputInteger32)(Cookie cookie, std::int32_t n) {
+RT_EXT_API_GROUP_BEGIN
+bool IODEF(OutputInteger32)(Cookie cookie, std::int32_t n) {
   if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputInteger32")) {
     return false;
   }
@@ -1173,6 +1176,7 @@ bool IONAME(OutputInteger32)(Cookie cookie, std::int32_t n) {
       TypeCategory::Integer, 4, reinterpret_cast<void *>(&n), 0);
   return descr::DescriptorIO<Direction::Output>(*cookie, descriptor);
 }
+RT_EXT_API_GROUP_END
 
 bool IONAME(OutputInteger64)(Cookie cookie, std::int64_t n) {
   if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputInteger64")) {
@@ -1448,10 +1452,12 @@ bool IONAME(InquireInteger64)(
   return false;
 }
 
-enum Iostat IONAME(EndIoStatement)(Cookie cookie) {
+RT_EXT_API_GROUP_BEGIN
+enum Iostat IODEF(EndIoStatement)(Cookie cookie) {
   IoStatementState &io{*cookie};
   return static_cast<enum Iostat>(io.EndIoStatement());
 }
+RT_EXT_API_GROUP_END
 
 template <typename INT>
 static enum Iostat CheckUnitNumberInRangeImpl(INT unit, bool handleError,
diff --git a/flang/runtime/io-error.cpp b/flang/runtime/io-error.cpp
index c8f6675c60a6c8..02f237f05bea15 100644
--- a/flang/runtime/io-error.cpp
+++ b/flang/runtime/io-error.cpp
@@ -16,6 +16,7 @@
 #include <cstring>
 
 namespace Fortran::runtime::io {
+RT_OFFLOAD_API_GROUP_BEGIN
 
 void IoErrorHandler::SignalError(int iostatOrErrno, const char *msg, ...) {
   // Note that IOMSG= alone without IOSTAT=/END=/EOR=/ERR= does not suffice
@@ -44,12 +45,20 @@ void IoErrorHandler::SignalError(int iostatOrErrno, const char *msg, ...) {
       if (ioStat_ <= 0) {
         ioStat_ = iostatOrErrno; // priority over END=/EOR=
         if (msg && (flags_ & hasIoMsg)) {
+#if !defined(RT_DEVICE_COMPILATION)
           char buffer[256];
           va_list ap;
           va_start(ap, msg);
           std::vsnprintf(buffer, sizeof buffer, msg, ap);
-          ioMsg_ = SaveDefaultCharacter(buffer, std::strlen(buffer) + 1, *this);
           va_end(ap);
+#else
+          const char *buffer = "not implemented yet: IOSTAT with varargs";
+#endif
+          ioMsg_ = SaveDefaultCharacter(
+              buffer, Fortran::runtime::strlen(buffer) + 1, *this);
+#if !defined(RT_DEVICE_COMPILATION)
+          va_end(ap);
+#endif
         }
       }
       return;
@@ -58,15 +67,23 @@ void IoErrorHandler::SignalError(int iostatOrErrno, const char *msg, ...) {
   }
   // I/O error not caught!
   if (msg) {
+#if !defined(RT_DEVICE_COMPILATION)
     va_list ap;
     va_start(ap, msg);
     CrashArgs(msg, ap);
     va_end(ap);
+#else
+    Crash("not implemented yet: IOSTAT with varargs");
+#endif
   } else if (const char *errstr{IostatErrorString(iostatOrErrno)}) {
     Crash(errstr);
   } else {
+#if !defined(RT_DEVICE_COMPILATION)
     Crash("I/O error (errno=%d): %s", iostatOrErrno,
         std::strerror(iostatOrErrno));
+#else
+    Crash("I/O error (errno=%d)", iostatOrErrno);
+#endif
   }
 }
 
@@ -85,8 +102,6 @@ void IoErrorHandler::Forward(
   }
 }
 
-void IoErrorHandler::SignalErrno() { SignalError(errno); }
-
 void IoErrorHandler::SignalEnd() { SignalError(IostatEnd); }
 
 void IoErrorHandler::SignalEor() { SignalError(IostatEor); }
@@ -97,6 +112,10 @@ void IoErrorHandler::SignalPendingError() {
   SignalError(error);
 }
 
+RT_OFFLOAD_API_GROUP_END
+
+void IoErrorHandler::SignalErrno() { SignalError(errno); }
+
 bool IoErrorHandler::GetIoMsg(char *buffer, std::size_t bufferLength) {
   const char *msg{ioMsg_.get()};
   if (!msg) {
@@ -132,7 +151,7 @@ bool IoErrorHandler::GetIoMsg(char *buffer, std::size_t bufferLength) {
     ToFortranDefaultCharacter(buffer, bufferLength, msg);
     return true;
   } else if (ok) {
-    std::size_t copied{std::strlen(buffer)};
+    std::size_t copied{Fortran::runtime::strlen(buffer)};
     if (copied < bufferLength) {
       std::memset(buffer + copied, ' ', bufferLength - copied);
     }
diff --git a/flang/runtime/io-error.h b/flang/runtime/io-error.h
index 565e7153351e7e..0fe11c9185c0a9 100644
--- a/flang/runtime/io-error.h
+++ b/flang/runtime/io-error.h
@@ -26,14 +26,15 @@ namespace Fortran::runtime::io {
 class IoErrorHandler : public Terminator {
 public:
   using Terminator::Terminator;
-  explicit IoErrorHandler(const Terminator &that) : Terminator{that} {}
-  void HasIoStat() { flags_ |= hasIoStat; }
-  void HasErrLabel() { flags_ |= hasErr; }
-  void HasEndLabel() { flags_ |= hasEnd; }
-  void HasEorLabel() { flags_ |= hasEor; }
-  void HasIoMsg() { flags_ |= hasIoMsg; }
+  explicit RT_API_ATTRS IoErrorHandler(const Terminator &that)
+      : Terminator{that} {}
+  RT_API_ATTRS void HasIoStat() { flags_ |= hasIoStat; }
+  RT_API_ATTRS void HasErrLabel() { flags_ |= hasErr; }
+  RT_API_ATTRS void HasEndLabel() { flags_ |= hasEnd; }
+  RT_API_ATTRS void HasEorLabel() { flags_ |= hasEor; }
+  RT_API_ATTRS void HasIoMsg() { flags_ |= hasIoMsg; }
 
-  bool InError() const {
+  RT_API_ATTRS bool InError() const {
     return ioStat_ != IostatOk || pendingError_ != IostatOk;
   }
 
@@ -41,22 +42,25 @@ class IoErrorHandler : public Terminator {
   // Begin...() API routines before it is known whether they
   // have error handling control list items.  Such statements
   // have an ErroneousIoStatementState with a pending error.
-  void SetPendingError(int iostat) { pendingError_ = iostat; }
+  RT_API_ATTRS void SetPendingError(int iostat) { pendingError_ = iostat; }
 
-  void SignalError(int iostatOrErrno, const char *msg, ...);
-  void SignalError(int iostatOrErrno);
-  template <typename... X> void SignalError(const char *msg, X &&...xs) {
+  RT_API_ATTRS void SignalError(int iostatOrErrno, const char *msg, ...);
+  RT_API_ATTRS void SignalError(int iostatOrErrno);
+  template <typename... X>
+  RT_API_ATTRS void SignalError(const char *msg, X &&...xs) {
     SignalError(IostatGenericError, msg, std::forward<X>(xs)...);
   }
 
-  void Forward(int iostatOrErrno, const char *, std::size_t);
+  RT_API_ATTRS void Forward(int iostatOrErrno, const char *, std::size_t);
 
   void SignalErrno(); // SignalError(errno)
-  void SignalEnd(); // input only; EOF on internal write is an error
-  void SignalEor(); // non-advancing input only; EOR on write is an error
-  void SignalPendingError();
+  RT_API_ATTRS void
+  SignalEnd(); // input only; EOF on internal write is an error
+  RT_API_ATTRS void
+  SignalEor(); // non-advancing input only; EOR on write is an error
+  RT_API_ATTRS void SignalPendingError();
 
-  int GetIoStat() const { return ioStat_; }
+  RT_API_ATTRS int GetIoStat() const { return ioStat_; }
   bool GetIoMsg(char *, std::size_t);
 
 private:
diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp
index e3f1214324d887..022e4c806bf63b 100644
--- a/flang/runtime/io-stmt.cpp
+++ b/flang/runtime/io-stmt.cpp
@@ -21,6 +21,7 @@
 #include <type_traits>
 
 namespace Fortran::runtime::io {
+RT_OFFLOAD_API_GROUP_BEGIN
 
 bool IoStatementBase::Emit(const char *, std::size_t, std::size_t) {
   return false;
@@ -44,10 +45,6 @@ Fortran::common::optional<DataEdit> IoStatementBase::GetNextDataEdit(
   return Fortran::common::nullopt;
 }
 
-ExternalFileUnit *IoStatementBase::GetExternalFileUnit() const {
-  return nullptr;
-}
-
 bool IoStatementBase::BeginReadingRecord() { return true; }
 
 void IoStatementBase::FinishReadingRecord() {}
@@ -56,6 +53,12 @@ void IoStatementBase::HandleAbsolutePosition(std::int64_t) {}
 
 void IoStatementBase::HandleRelativePosition(std::int64_t) {}
 
+std::int64_t IoStatementBase::InquirePos() { return 0; }
+
+ExternalFileUnit *IoStatementBase::GetExternalFileUnit() const {
+  return nullptr;
+}
+
 bool IoStatementBase::Inquire(InquiryKeywordHash, char *, std::size_t) {
   return false;
 }
@@ -70,8 +73,6 @@ bool IoStatementBase::Inquire(InquiryKeywordHash, std::int64_t &) {
   return false;
 }
 
-std::int64_t IoStatementBase::InquirePos() { return 0; }
-
 void IoStatementBase::BadInquiryKeywordHashCrash(InquiryKeywordHash inquiry) {
   char buffer[16];
   const char *decode{InquiryKeywordHashDecode(buffer, sizeof buffer, inquiry)};
@@ -142,21 +143,23 @@ std::int64_t InternalIoStatementState<DIR>::InquirePos() {
 }
 
 template <Direction DIR, typename CHAR>
+RT_API_ATTRS
 InternalFormattedIoStatementState<DIR, CHAR>::InternalFormattedIoStatementState(
     Buffer buffer, std::size_t length, const CharType *format,
     std::size_t formatLength, const Descriptor *formatDescriptor,
     const char *sourceFile, int sourceLine)
     : InternalIoStatementState<DIR>{buffer, length, sourceFile, sourceLine},
-      ioStatementState_{*this}, format_{*this, format, formatLength,
-                                    formatDescriptor} {}
+      ioStatementState_{*this},
+      format_{*this, format, formatLength, formatDescriptor} {}
 
 template <Direction DIR, typename CHAR>
+RT_API_ATTRS
 InternalFormattedIoStatementState<DIR, CHAR>::InternalFormattedIoStatementState(
     const Descriptor &d, const CharType *format, std::size_t formatLength,
     const Descriptor *formatDescriptor, const char *sourceFile, int sourceLine)
     : InternalIoStatementState<DIR>{d, sourceFile, sourceLine},
-      ioStatementState_{*this}, format_{*this, format, formatLength,
-                                    formatDescriptor} {}
+      ioStatementState_{*this},
+      format_{*this, format, formatLength, formatDescriptor} {}
 
 template <Direction DIR, typename CHAR>
 void InternalFormattedIoStatementState<DIR, CHAR>::CompleteOperation() {
@@ -1004,7 +1007,9 @@ void ExternalMiscIoStatementState::CompleteOperation() {
   switch (which_) {
   case Flush:
     ext.FlushOutput(*this);
+#if !defined(RT_DEVICE_COMPILATION)
     std::fflush(nullptr); // flushes C stdio output streams (12.9(2))
+#endif
     break;
   case Backspace:
     ext.BackspaceRecord(*this);
@@ -1508,4 +1513,5 @@ int ErroneousIoStatementState::EndIoStatement() {
   return IoStatementBase::EndIoStatement();
 }
 
+RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime::io
diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h
index e00d54980aae59..e0dafa9c763c4f 100644
--- a/flang/runtime/io-stmt.h
+++ b/flang/runtime/io-stmt.h
@@ -61,8 +61,8 @@ using IoDirectionState = std::conditional_t<D == Direction::Input,
 template <Direction D> class FormattedIoStatementState {};
 template <> class FormattedIoStatementState<Direction::Input> {
 public:
-  std::size_t GetEditDescriptorChars() const;
-  void GotChar(int);
+  RT_API_ATTRS std::size_t GetEditDescriptorChars() const;
+  RT_API_ATTRS void GotChar(int);
 
 private:
   // Account of characters read for edit descriptors (i.e., formatted I/O
@@ -73,7 +73,7 @@ template <> class FormattedIoStatementState<Direction::Input> {
 // The Cookie type in the I/O API is a pointer (for C) to this class.
 class IoStatementState {
 public:
-  template <typename A> explicit IoStatementState(A &x) : u_{x} {}
+  template <typename A> explicit RT_API_ATTRS IoStatementState(A &x) : u_{x} {}
 
   // These member functions each project themselves into the active alternative.
   // They're used by per-data-item routines in the I/O API (e.g., OutputReal64)
@@ -85,34 +85,39 @@ class IoStatementState {
   // It is called by EndIoStatement(), but it can be invoked earlier to
   // catch errors for (e.g.) GetIoMsg() and GetNewUnit().  If called
   // more than once, it is a no-op.
-  void CompleteOperation();
+  RT_API_ATTRS void CompleteOperation();
   // Completes an I/O statement and reclaims storage.
-  int EndIoStatement();
-
-  bool Emit(const char *, std::size_t bytes, std::size_t elementBytes = 0);
-  bool Receive(char *, std::size_t, std::size_t elementBytes = 0);
-  std::size_t GetNextInputBytes(const char *&);
-  bool AdvanceRecord(int = 1);
-  void BackspaceRecord();
-  void HandleRelativePosition(std::int64_t byteOffset);
-  void HandleAbsolutePosition(std::int64_t byteOffset); // for r* in list I/O
-  Fortran::common::optional<DataEdit> GetNextDataEdit(int maxRepeat = 1);
-  ExternalFileUnit *GetExternalFileUnit() const; // null if internal unit
-  bool BeginReadingRecord();
-  void FinishReadingRecord();
-  bool Inquire(InquiryKeywordHash, char *, std::size_t);
-  bool Inquire(InquiryKeywordHash, bool &);
-  bool Inquire(InquiryKeywordHash, std::int64_t, bool &); // PENDING=
-  bool Inquire(InquiryKeywordHash, std::int64_t &);
-  std::int64_t InquirePos();
-  void GotChar(signed int = 1); // for READ(SIZE=); can be <0
-
-  MutableModes &mutableModes();
-  ConnectionState &GetConnectionState();
-  IoErrorHandler &GetIoErrorHandler() const;
+  RT_API_ATTRS int EndIoStatement();
+
+  RT_API_ATTRS bool Emit(
+      const char *, std::size_t bytes, std::size_t elementBytes = 0);
+  RT_API_ATTRS bool Receive(char *, std::size_t, std::size_t elementBytes = 0);
+  RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+  RT_API_ATTRS bool AdvanceRecord(int = 1);
+  RT_API_ATTRS void BackspaceRecord();
+  RT_API_ATTRS void HandleRelativePosition(std::int64_t byteOffset);
+  RT_API_ATTRS void HandleAbsolutePosition(
+      std::int64_t byteOffset); // for r* in list I/O
+  RT_API_ATTRS Fortran::common::optional<DataEdit> GetNextDataEdit(
+      int maxRepeat = 1);
+  RT_API_ATTRS ExternalFileUnit *
+  GetExternalFileUnit() const; // null if internal unit
+  RT_API_ATTRS bool BeginReadingRecord();
+  RT_API_ATTRS void FinishReadingRecord();
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, char *, std::size_t);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, bool &);
+  RT_API_ATTRS bool Inquire(
+      InquiryKeywordHash, std::int64_t, bool &); // PENDING=
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, std::int64_t &);
+  RT_API_ATTRS std::int64_t InquirePos();
+  RT_API_ATTRS void GotChar(signed int = 1); // for READ(SIZE=); can be <0
+
+  RT_API_ATTRS MutableModes &mutableModes();
+  RT_API_ATTRS ConnectionState &GetConnectionState();
+  RT_API_ATTRS IoErrorHandler &GetIoErrorHandler() const;
 
   // N.B.: this also works with base classes
-  template <typename A> A *get_if() const {
+  template <typename A> RT_API_ATTRS A *get_if() const {
     return common::visit(
         [](auto &x) -> A * {
           if constexpr (std::is_convertible_v<decltype(x.get()), A &>) {
@@ -124,7 +129,8 @@ class IoStatementState {
   }
 
   // Vacant after the end of the current record
-  Fortran::common::optional<char32_t> GetCurrentChar(std::size_t &byteCount);
+  RT_API_ATTRS Fortran::common::optional<char32_t> GetCurrentChar(
+      std::size_t &byteCount);
 
   // The "remaining" arguments to CueUpInput(), SkipSpaces(), & NextInField()
   // are always in units of bytes, not characters; the distinction matters
@@ -132,7 +138,7 @@ class IoStatementState {
 
   // For fixed-width fields, return the number of remaining bytes.
   // Skip over leading blanks.
-  Fortran::common::optional<int> CueUpInput(const DataEdit &edit) {
+  RT_API_ATTRS Fortran::common::optional<int> CueUpInput(const DataEdit &edit) {
     Fortran::common::optional<int> remaining;
     if (edit.IsListDirected()) {
       std::size_t byteCount{0};
@@ -150,7 +156,7 @@ class IoStatementState {
     return remaining;
   }
 
-  Fortran::common::optional<char32_t> SkipSpaces(
+  RT_API_ATTRS Fortran::common::optional<char32_t> SkipSpaces(
       Fortran::common::optional<int> &remaining) {
     while (!remaining || *remaining > 0) {
       std::size_t byteCount{0};
@@ -175,15 +181,16 @@ class IoStatementState {
 
   // Acquires the next input character, respecting any applicable field width
   // or separator character.
-  Fortran::common::optional<char32_t> NextInField(
+  RT_API_ATTRS Fortran::common::optional<char32_t> NextInField(
       Fortran::common::optional<int> &remaining, const DataEdit &);
 
   // Detect and signal any end-of-record condition after input.
   // Returns true if at EOR and remaining input should be padded with blanks.
-  bool CheckForEndOfRecord(std::size_t afterReading);
+  RT_API_ATTRS bool CheckForEndOfRecord(std::size_t afterReading);
 
   // Skips spaces, advances records, and ignores NAMELIST comments
-  Fortran::common::optional<char32_t> GetNextNonBlank(std::size_t &byteCount) {
+  RT_API_ATTRS Fortran::common::optional<char32_t> GetNextNonBlank(
+      std::size_t &byteCount) {
     auto ch{GetCurrentChar(byteCount)};
     bool inNamelist{mutableModes().inNamelist};
     while (!ch || *ch == ' ' || *ch == '\t' || (inNamelist && *ch == '!')) {
@@ -197,7 +204,8 @@ class IoStatementState {
     return ch;
   }
 
-  template <Direction D> bool CheckFormattedStmtType(const char *name) {
+  template <Direction D>
+  RT_API_ATTRS bool CheckFormattedStmtType(const char *name) {
     if (get_if<FormattedIoStatementState<D>>()) {
       return true;
     } else {
@@ -260,31 +268,33 @@ class IoStatementBase : public IoErrorHandler {
 public:
   using IoErrorHandler::IoErrorHandler;
 
-  bool completedOperation() const { return completedOperation_; }
+  RT_API_ATTRS bool completedOperation() const { return completedOperation_; }
 
-  void CompleteOperation() { completedOperation_ = true; }
-  int EndIoStatement() { return GetIoStat(); }
+  RT_API_ATTRS void CompleteOperation() { completedOperation_ = true; }
+  RT_API_ATTRS int EndIoStatement() { return GetIoStat(); }
 
   // These are default no-op backstops that can be overridden by descendants.
-  bool Emit(const char *, std::size_t bytes, std::size_t elementBytes = 0);
-  bool Receive(char *, std::size_t bytes, std::size_t elementBytes = 0);
-  std::size_t GetNextInputBytes(const char *&);
-  bool AdvanceRecord(int);
-  void BackspaceRecord();
-  void HandleRelativePosition(std::int64_t);
-  void HandleAbsolutePosition(std::int64_t);
-  Fortran::common::optional<DataEdit> GetNextDataEdit(
+  RT_API_ATTRS bool Emit(
+      const char *, std::size_t bytes, std::size_t elementBytes = 0);
+  RT_API_ATTRS bool Receive(
+      char *, std::size_t bytes, std::size_t elementBytes = 0);
+  RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+  RT_API_ATTRS bool AdvanceRecord(int);
+  RT_API_ATTRS void BackspaceRecord();
+  RT_API_ATTRS void HandleRelativePosition(std::int64_t);
+  RT_API_ATTRS void HandleAbsolutePosition(std::int64_t);
+  RT_API_ATTRS Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1);
-  ExternalFileUnit *GetExternalFileUnit() const;
-  bool BeginReadingRecord();
-  void FinishReadingRecord();
-  bool Inquire(InquiryKeywordHash, char *, std::size_t);
-  bool Inquire(InquiryKeywordHash, bool &);
-  bool Inquire(InquiryKeywordHash, std::int64_t, bool &);
-  bool Inquire(InquiryKeywordHash, std::int64_t &);
-  std::int64_t InquirePos();
+  RT_API_ATTRS ExternalFileUnit *GetExternalFileUnit() const;
+  RT_API_ATTRS bool BeginReadingRecord();
+  RT_API_ATTRS void FinishReadingRecord();
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, char *, std::size_t);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, bool &);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, std::int64_t, bool &);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, std::int64_t &);
+  RT_API_ATTRS std::int64_t InquirePos();
 
-  void BadInquiryKeywordHashCrash(InquiryKeywordHash);
+  RT_API_ATTRS void BadInquiryKeywordHashCrash(InquiryKeywordHash);
 
 protected:
   bool completedOperation_{false};
@@ -296,14 +306,14 @@ template <>
 class ListDirectedStatementState<Direction::Output>
     : public FormattedIoStatementState<Direction::Output> {
 public:
-  bool EmitLeadingSpaceOrAdvance(
+  RT_API_ATTRS bool EmitLeadingSpaceOrAdvance(
       IoStatementState &, std::size_t = 1, bool isCharacter = false);
-  Fortran::common::optional<DataEdit> GetNextDataEdit(
+  RT_API_ATTRS Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1);
-  bool lastWasUndelimitedCharacter() const {
+  RT_API_ATTRS bool lastWasUndelimitedCharacter() const {
     return lastWasUndelimitedCharacter_;
   }
-  void set_lastWasUndelimitedCharacter(bool yes = true) {
+  RT_API_ATTRS void set_lastWasUndelimitedCharacter(bool yes = true) {
     lastWasUndelimitedCharacter_ = yes;
   }
 
@@ -314,20 +324,20 @@ template <>
 class ListDirectedStatementState<Direction::Input>
     : public FormattedIoStatementState<Direction::Input> {
 public:
-  bool inNamelistSequence() const { return inNamelistSequence_; }
-  int EndIoStatement();
+  RT_API_ATTRS bool inNamelistSequence() const { return inNamelistSequence_; }
+  RT_API_ATTRS int EndIoStatement();
 
   // Skips value separators, handles repetition and null values.
   // Vacant when '/' appears; present with descriptor == ListDirectedNullValue
   // when a null value appears.
-  Fortran::common::optional<DataEdit> GetNextDataEdit(
+  RT_API_ATTRS Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1);
 
   // Each NAMELIST input item is treated like a distinct list-directed
   // input statement.  This member function resets some state so that
   // repetition and null values work correctly for each successive
   // NAMELIST input item.
-  void ResetForNextNamelistItem(bool inNamelistSequence) {
+  RT_API_ATTRS void ResetForNextNamelistItem(bool inNamelistSequence) {
     remaining_ = 0;
     if (repeatPosition_) {
       repeatPosition_->Cancel();
@@ -353,21 +363,22 @@ class InternalIoStatementState : public IoStatementBase,
 public:
   using Buffer =
       std::conditional_t<DIR == Direction::Input, const char *, char *>;
-  InternalIoStatementState(Buffer, std::size_t,
+  RT_API_ATTRS InternalIoStatementState(Buffer, std::size_t,
       const char *sourceFile = nullptr, int sourceLine = 0);
-  InternalIoStatementState(
+  RT_API_ATTRS InternalIoStatementState(
       const Descriptor &, const char *sourceFile = nullptr, int sourceLine = 0);
-  int EndIoStatement();
-
-  bool Emit(const char *data, std::size_t bytes, std::size_t elementBytes = 0);
-  std::size_t GetNextInputBytes(const char *&);
-  bool AdvanceRecord(int = 1);
-  void BackspaceRecord();
-  ConnectionState &GetConnectionState() { return unit_; }
-  MutableModes &mutableModes() { return unit_.modes; }
-  void HandleRelativePosition(std::int64_t);
-  void HandleAbsolutePosition(std::int64_t);
-  std::int64_t InquirePos();
+  RT_API_ATTRS int EndIoStatement();
+
+  RT_API_ATTRS bool Emit(
+      const char *data, std::size_t bytes, std::size_t elementBytes = 0);
+  RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+  RT_API_ATTRS bool AdvanceRecord(int = 1);
+  RT_API_ATTRS void BackspaceRecord();
+  RT_API_ATTRS ConnectionState &GetConnectionState() { return unit_; }
+  RT_API_ATTRS MutableModes &mutableModes() { return unit_.modes; }
+  RT_API_ATTRS void HandleRelativePosition(std::int64_t);
+  RT_API_ATTRS void HandleAbsolutePosition(std::int64_t);
+  RT_API_ATTRS std::int64_t InquirePos();
 
 protected:
   bool free_{true};
@@ -381,17 +392,20 @@ class InternalFormattedIoStatementState
 public:
   using CharType = CHAR;
   using typename InternalIoStatementState<DIR>::Buffer;
-  InternalFormattedIoStatementState(Buffer internal, std::size_t internalLength,
+  RT_API_ATTRS InternalFormattedIoStatementState(Buffer internal,
+      std::size_t internalLength, const CharType *format,
+      std::size_t formatLength, const Descriptor *formatDescriptor = nullptr,
+      const char *sourceFile = nullptr, int sourceLine = 0);
+  RT_API_ATTRS InternalFormattedIoStatementState(const Descriptor &,
       const CharType *format, std::size_t formatLength,
       const Descriptor *formatDescriptor = nullptr,
       const char *sourceFile = nullptr, int sourceLine = 0);
-  InternalFormattedIoStatementState(const Descriptor &, const CharType *format,
-      std::size_t formatLength, const Descriptor *formatDescriptor = nullptr,
-      const char *sourceFile = nullptr, int sourceLine = 0);
-  IoStatementState &ioStatementState() { return ioStatementState_; }
-  void CompleteOperation();
-  int EndIoStatement();
-  Fortran::common::optional<DataEdit> GetNextDataEdit(
+  RT_API_ATTRS IoStatementState &ioStatementState() {
+    return ioStatementState_;
+  }
+  RT_API_ATTRS void CompleteOperation();
+  RT_API_ATTRS int EndIoStatement();
+  RT_API_ATTRS Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1) {
     return format_.GetNextDataEdit(*this, maxRepeat);
   }
@@ -408,14 +422,17 @@ class InternalListIoStatementState : public InternalIoStatementState<DIR>,
                                      public ListDirectedStatementState<DIR> {
 public:
   using typename InternalIoStatementState<DIR>::Buffer;
-  InternalListIoStatementState(Buffer internal, std::size_t internalLength,
-      const char *sourceFile = nullptr, int sourceLine = 0);
-  InternalListIoStatementState(
+  RT_API_ATTRS InternalListIoStatementState(Buffer internal,
+      std::size_t internalLength, const char *sourceFile = nullptr,
+      int sourceLine = 0);
+  RT_API_ATTRS InternalListIoStatementState(
       const Descriptor &, const char *sourceFile = nullptr, int sourceLine = 0);
-  IoStatementState &ioStatementState() { return ioStatementState_; }
+  RT_API_ATTRS IoStatementState &ioStatementState() {
+    return ioStatementState_;
+  }
   using ListDirectedStatementState<DIR>::GetNextDataEdit;
-  void CompleteOperation();
-  int EndIoStatement();
+  RT_API_ATTRS void CompleteOperation();
+  RT_API_ATTRS int EndIoStatement();
 
 private:
   IoStatementState ioStatementState_; // points to *this
@@ -424,16 +441,16 @@ class InternalListIoStatementState : public InternalIoStatementState<DIR>,
 
 class ExternalIoStatementBase : public IoStatementBase {
 public:
-  ExternalIoStatementBase(
+  RT_API_ATTRS ExternalIoStatementBase(
       ExternalFileUnit &, const char *sourceFile = nullptr, int sourceLine = 0);
-  ExternalFileUnit &unit() { return unit_; }
-  MutableModes &mutableModes();
-  ConnectionState &GetConnectionState();
-  int asynchronousID() const { return asynchronousID_; }
-  int EndIoStatement();
-  ExternalFileUnit *GetExternalFileUnit() const { return &unit_; }
-  void SetAsynchronous();
-  std::int64_t InquirePos();
+  RT_API_ATTRS ExternalFileUnit &unit() { return unit_; }
+  RT_API_ATTRS MutableModes &mutableModes();
+  RT_API_ATTRS ConnectionState &GetConnectionState();
+  RT_API_ATTRS int asynchronousID() const { return asynchronousID_; }
+  RT_API_ATTRS int EndIoStatement();
+  RT_API_ATTRS ExternalFileUnit *GetExternalFileUnit() const { return &unit_; }
+  RT_API_ATTRS void SetAsynchronous();
+  RT_API_ATTRS std::int64_t InquirePos();
 
 private:
   ExternalFileUnit &unit_;
@@ -444,19 +461,20 @@ template <Direction DIR>
 class ExternalIoStatementState : public ExternalIoStatementBase,
                                  public IoDirectionState<DIR> {
 public:
-  ExternalIoStatementState(
+  RT_API_ATTRS ExternalIoStatementState(
       ExternalFileUnit &, const char *sourceFile = nullptr, int sourceLine = 0);
-  MutableModes &mutableModes() { return mutableModes_; }
-  void CompleteOperation();
-  int EndIoStatement();
-  bool Emit(const char *, std::size_t bytes, std::size_t elementBytes = 0);
-  std::size_t GetNextInputBytes(const char *&);
-  bool AdvanceRecord(int = 1);
-  void BackspaceRecord();
-  void HandleRelativePosition(std::int64_t);
-  void HandleAbsolutePosition(std::int64_t);
-  bool BeginReadingRecord();
-  void FinishReadingRecord();
+  RT_API_ATTRS MutableModes &mutableModes() { return mutableModes_; }
+  RT_API_ATTRS void CompleteOperation();
+  RT_API_ATTRS int EndIoStatement();
+  RT_API_ATTRS bool Emit(
+      const char *, std::size_t bytes, std::size_t elementBytes = 0);
+  RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+  RT_API_ATTRS bool AdvanceRecord(int = 1);
+  RT_API_ATTRS void BackspaceRecord();
+  RT_API_ATTRS void HandleRelativePosition(std::int64_t);
+  RT_API_ATTRS void HandleAbsolutePosition(std::int64_t);
+  RT_API_ATTRS bool BeginReadingRecord();
+  RT_API_ATTRS void FinishReadingRecord();
 
 private:
   // These are forked from ConnectionState's modes at the beginning
@@ -471,12 +489,13 @@ class ExternalFormattedIoStatementState
       public FormattedIoStatementState<DIR> {
 public:
   using CharType = CHAR;
-  ExternalFormattedIoStatementState(ExternalFileUnit &, const CharType *format,
-      std::size_t formatLength, const Descriptor *formatDescriptor = nullptr,
+  RT_API_ATTRS ExternalFormattedIoStatementState(ExternalFileUnit &,
+      const CharType *format, std::size_t formatLength,
+      const Descriptor *formatDescriptor = nullptr,
       const char *sourceFile = nullptr, int sourceLine = 0);
-  void CompleteOperation();
-  int EndIoStatement();
-  Fortran::common::optional<DataEdit> GetNextDataEdit(
+  RT_API_ATTRS void CompleteOperation();
+  RT_API_ATTRS int EndIoStatement();
+  RT_API_ATTRS Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1) {
     return format_.GetNextDataEdit(*this, maxRepeat);
   }
@@ -491,7 +510,7 @@ class ExternalListIoStatementState : public ExternalIoStatementState<DIR>,
 public:
   using ExternalIoStatementState<DIR>::ExternalIoStatementState;
   using ListDirectedStatementState<DIR>::GetNextDataEdit;
-  int EndIoStatement();
+  RT_API_ATTRS int EndIoStatement();
 };
 
 template <Direction DIR>
@@ -499,24 +518,25 @@ class ExternalUnformattedIoStatementState
     : public ExternalIoStatementState<DIR> {
 public:
   using ExternalIoStatementState<DIR>::ExternalIoStatementState;
-  bool Receive(char *, std::size_t, std::size_t elementBytes = 0);
+  RT_API_ATTRS bool Receive(char *, std::size_t, std::size_t elementBytes = 0);
 };
 
 template <Direction DIR>
 class ChildIoStatementState : public IoStatementBase,
                               public IoDirectionState<DIR> {
 public:
-  ChildIoStatementState(
+  RT_API_ATTRS ChildIoStatementState(
       ChildIo &, const char *sourceFile = nullptr, int sourceLine = 0);
-  ChildIo &child() { return child_; }
-  MutableModes &mutableModes();
-  ConnectionState &GetConnectionState();
-  ExternalFileUnit *GetExternalFileUnit() const;
-  int EndIoStatement();
-  bool Emit(const char *, std::size_t bytes, std::size_t elementBytes = 0);
-  std::size_t GetNextInputBytes(const char *&);
-  void HandleRelativePosition(std::int64_t);
-  void HandleAbsolutePosition(std::int64_t);
+  RT_API_ATTRS ChildIo &child() { return child_; }
+  RT_API_ATTRS MutableModes &mutableModes();
+  RT_API_ATTRS ConnectionState &GetConnectionState();
+  RT_API_ATTRS ExternalFileUnit *GetExternalFileUnit() const;
+  RT_API_ATTRS int EndIoStatement();
+  RT_API_ATTRS bool Emit(
+      const char *, std::size_t bytes, std::size_t elementBytes = 0);
+  RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
+  RT_API_ATTRS void HandleRelativePosition(std::int64_t);
+  RT_API_ATTRS void HandleAbsolutePosition(std::int64_t);
 
 private:
   ChildIo &child_;
@@ -527,14 +547,14 @@ class ChildFormattedIoStatementState : public ChildIoStatementState<DIR>,
                                        public FormattedIoStatementState<DIR> {
 public:
   using CharType = CHAR;
-  ChildFormattedIoStatementState(ChildIo &, const CharType *format,
+  RT_API_ATTRS ChildFormattedIoStatementState(ChildIo &, const CharType *format,
       std::size_t formatLength, const Descriptor *formatDescriptor = nullptr,
       const char *sourceFile = nullptr, int sourceLine = 0);
-  MutableModes &mutableModes() { return mutableModes_; }
-  void CompleteOperation();
-  int EndIoStatement();
-  bool AdvanceRecord(int = 1);
-  Fortran::common::optional<DataEdit> GetNextDataEdit(
+  RT_API_ATTRS MutableModes &mutableModes() { return mutableModes_; }
+  RT_API_ATTRS void CompleteOperation();
+  RT_API_ATTRS int EndIoStatement();
+  RT_API_ATTRS bool AdvanceRecord(int = 1);
+  RT_API_ATTRS Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1) {
     return format_.GetNextDataEdit(*this, maxRepeat);
   }
@@ -550,34 +570,42 @@ class ChildListIoStatementState : public ChildIoStatementState<DIR>,
 public:
   using ChildIoStatementState<DIR>::ChildIoStatementState;
   using ListDirectedStatementState<DIR>::GetNextDataEdit;
-  int EndIoStatement();
+  RT_API_ATTRS int EndIoStatement();
 };
 
 template <Direction DIR>
 class ChildUnformattedIoStatementState : public ChildIoStatementState<DIR> {
 public:
   using ChildIoStatementState<DIR>::ChildIoStatementState;
-  bool Receive(char *, std::size_t, std::size_t elementBytes = 0);
+  RT_API_ATTRS bool Receive(char *, std::size_t, std::size_t elementBytes = 0);
 };
 
 // OPEN
 class OpenStatementState : public ExternalIoStatementBase {
 public:
-  OpenStatementState(ExternalFileUnit &unit, bool wasExtant, bool isNewUnit,
-      const char *sourceFile = nullptr, int sourceLine = 0)
+  RT_API_ATTRS OpenStatementState(ExternalFileUnit &unit, bool wasExtant,
+      bool isNewUnit, const char *sourceFile = nullptr, int sourceLine = 0)
       : ExternalIoStatementBase{unit, sourceFile, sourceLine},
         wasExtant_{wasExtant}, isNewUnit_{isNewUnit} {}
-  bool wasExtant() const { return wasExtant_; }
-  void set_status(OpenStatus status) { status_ = status; } // STATUS=
-  void set_path(const char *, std::size_t); // FILE=
-  void set_position(Position position) { position_ = position; } // POSITION=
-  void set_action(Action action) { action_ = action; } // ACTION=
-  void set_convert(Convert convert) { convert_ = convert; } // CONVERT=
-  void set_access(Access access) { access_ = access; } // ACCESS=
-  void set_isUnformatted(bool yes = true) { isUnformatted_ = yes; } // FORM=
-
-  void CompleteOperation();
-  int EndIoStatement();
+  RT_API_ATTRS bool wasExtant() const { return wasExtant_; }
+  RT_API_ATTRS void set_status(OpenStatus status) {
+    status_ = status;
+  } // STATUS=
+  RT_API_ATTRS void set_path(const char *, std::size_t); // FILE=
+  RT_API_ATTRS void set_position(Position position) {
+    position_ = position;
+  } // POSITION=
+  RT_API_ATTRS void set_action(Action action) { action_ = action; } // ACTION=
+  RT_API_ATTRS void set_convert(Convert convert) {
+    convert_ = convert;
+  } // CONVERT=
+  RT_API_ATTRS void set_access(Access access) { access_ = access; } // ACCESS=
+  RT_API_ATTRS void set_isUnformatted(bool yes = true) {
+    isUnformatted_ = yes;
+  } // FORM=
+
+  RT_API_ATTRS void CompleteOperation();
+  RT_API_ATTRS int EndIoStatement();
 
 private:
   bool wasExtant_;
@@ -594,11 +622,11 @@ class OpenStatementState : public ExternalIoStatementBase {
 
 class CloseStatementState : public ExternalIoStatementBase {
 public:
-  CloseStatementState(ExternalFileUnit &unit, const char *sourceFile = nullptr,
-      int sourceLine = 0)
+  RT_API_ATTRS CloseStatementState(ExternalFileUnit &unit,
+      const char *sourceFile = nullptr, int sourceLine = 0)
       : ExternalIoStatementBase{unit, sourceFile, sourceLine} {}
-  void set_status(CloseStatus status) { status_ = status; }
-  int EndIoStatement();
+  RT_API_ATTRS void set_status(CloseStatus status) { status_ = status; }
+  RT_API_ATTRS int EndIoStatement();
 
 private:
   CloseStatus status_{CloseStatus::Keep};
@@ -608,16 +636,18 @@ class CloseStatementState : public ExternalIoStatementBase {
 // and recoverable BACKSPACE(bad unit)
 class NoUnitIoStatementState : public IoStatementBase {
 public:
-  IoStatementState &ioStatementState() { return ioStatementState_; }
-  MutableModes &mutableModes() { return connection_.modes; }
-  ConnectionState &GetConnectionState() { return connection_; }
-  int badUnitNumber() const { return badUnitNumber_; }
-  void CompleteOperation();
-  int EndIoStatement();
+  RT_API_ATTRS IoStatementState &ioStatementState() {
+    return ioStatementState_;
+  }
+  RT_API_ATTRS MutableModes &mutableModes() { return connection_.modes; }
+  RT_API_ATTRS ConnectionState &GetConnectionState() { return connection_; }
+  RT_API_ATTRS int badUnitNumber() const { return badUnitNumber_; }
+  RT_API_ATTRS void CompleteOperation();
+  RT_API_ATTRS int EndIoStatement();
 
 protected:
   template <typename A>
-  NoUnitIoStatementState(A &stmt, const char *sourceFile = nullptr,
+  RT_API_ATTRS NoUnitIoStatementState(A &stmt, const char *sourceFile = nullptr,
       int sourceLine = 0, int badUnitNumber = -1)
       : IoStatementBase{sourceFile, sourceLine}, ioStatementState_{stmt},
         badUnitNumber_{badUnitNumber} {}
@@ -630,10 +660,10 @@ class NoUnitIoStatementState : public IoStatementBase {
 
 class NoopStatementState : public NoUnitIoStatementState {
 public:
-  NoopStatementState(
+  RT_API_ATTRS NoopStatementState(
       const char *sourceFile = nullptr, int sourceLine = 0, int unitNumber = -1)
       : NoUnitIoStatementState{*this, sourceFile, sourceLine, unitNumber} {}
-  void set_status(CloseStatus) {} // discards
+  RT_API_ATTRS void set_status(CloseStatus) {} // discards
 };
 
 extern template class InternalIoStatementState<Direction::Output>;
@@ -674,32 +704,32 @@ extern template class FormatControl<
 
 class InquireUnitState : public ExternalIoStatementBase {
 public:
-  InquireUnitState(ExternalFileUnit &unit, const char *sourceFile = nullptr,
-      int sourceLine = 0);
-  bool Inquire(InquiryKeywordHash, char *, std::size_t);
-  bool Inquire(InquiryKeywordHash, bool &);
-  bool Inquire(InquiryKeywordHash, std::int64_t, bool &);
-  bool Inquire(InquiryKeywordHash, std::int64_t &);
+  RT_API_ATTRS InquireUnitState(ExternalFileUnit &unit,
+      const char *sourceFile = nullptr, int sourceLine = 0);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, char *, std::size_t);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, bool &);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, std::int64_t, bool &);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, std::int64_t &);
 };
 
 class InquireNoUnitState : public NoUnitIoStatementState {
 public:
-  InquireNoUnitState(const char *sourceFile = nullptr, int sourceLine = 0,
-      int badUnitNumber = -1);
-  bool Inquire(InquiryKeywordHash, char *, std::size_t);
-  bool Inquire(InquiryKeywordHash, bool &);
-  bool Inquire(InquiryKeywordHash, std::int64_t, bool &);
-  bool Inquire(InquiryKeywordHash, std::int64_t &);
+  RT_API_ATTRS InquireNoUnitState(const char *sourceFile = nullptr,
+      int sourceLine = 0, int badUnitNumber = -1);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, char *, std::size_t);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, bool &);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, std::int64_t, bool &);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, std::int64_t &);
 };
 
 class InquireUnconnectedFileState : public NoUnitIoStatementState {
 public:
-  InquireUnconnectedFileState(OwningPtr<char> &&path,
+  RT_API_ATTRS InquireUnconnectedFileState(OwningPtr<char> &&path,
       const char *sourceFile = nullptr, int sourceLine = 0);
-  bool Inquire(InquiryKeywordHash, char *, std::size_t);
-  bool Inquire(InquiryKeywordHash, bool &);
-  bool Inquire(InquiryKeywordHash, std::int64_t, bool &);
-  bool Inquire(InquiryKeywordHash, std::int64_t &);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, char *, std::size_t);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, bool &);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, std::int64_t, bool &);
+  RT_API_ATTRS bool Inquire(InquiryKeywordHash, std::int64_t &);
 
 private:
   OwningPtr<char> path_; // trimmed and NUL terminated
@@ -708,9 +738,11 @@ class InquireUnconnectedFileState : public NoUnitIoStatementState {
 class InquireIOLengthState : public NoUnitIoStatementState,
                              public OutputStatementState {
 public:
-  InquireIOLengthState(const char *sourceFile = nullptr, int sourceLine = 0);
-  std::size_t bytes() const { return bytes_; }
-  bool Emit(const char *, std::size_t bytes, std::size_t elementBytes = 0);
+  RT_API_ATTRS InquireIOLengthState(
+      const char *sourceFile = nullptr, int sourceLine = 0);
+  RT_API_ATTRS std::size_t bytes() const { return bytes_; }
+  RT_API_ATTRS bool Emit(
+      const char *, std::size_t bytes, std::size_t elementBytes = 0);
 
 private:
   std::size_t bytes_{0};
@@ -719,11 +751,11 @@ class InquireIOLengthState : public NoUnitIoStatementState,
 class ExternalMiscIoStatementState : public ExternalIoStatementBase {
 public:
   enum Which { Flush, Backspace, Endfile, Rewind, Wait };
-  ExternalMiscIoStatementState(ExternalFileUnit &unit, Which which,
+  RT_API_ATTRS ExternalMiscIoStatementState(ExternalFileUnit &unit, Which which,
       const char *sourceFile = nullptr, int sourceLine = 0)
       : ExternalIoStatementBase{unit, sourceFile, sourceLine}, which_{which} {}
-  void CompleteOperation();
-  int EndIoStatement();
+  RT_API_ATTRS void CompleteOperation();
+  RT_API_ATTRS int EndIoStatement();
 
 private:
   Which which_;
@@ -731,15 +763,15 @@ class ExternalMiscIoStatementState : public ExternalIoStatementBase {
 
 class ErroneousIoStatementState : public IoStatementBase {
 public:
-  explicit ErroneousIoStatementState(Iostat iostat,
+  explicit RT_API_ATTRS ErroneousIoStatementState(Iostat iostat,
       ExternalFileUnit *unit = nullptr, const char *sourceFile = nullptr,
       int sourceLine = 0)
       : IoStatementBase{sourceFile, sourceLine}, unit_{unit} {
     SetPendingError(iostat);
   }
-  int EndIoStatement();
-  ConnectionState &GetConnectionState() { return connection_; }
-  MutableModes &mutableModes() { return connection_.modes; }
+  RT_API_ATTRS int EndIoStatement();
+  RT_API_ATTRS ConnectionState &GetConnectionState() { return connection_; }
+  RT_API_ATTRS MutableModes &mutableModes() { return connection_.modes; }
 
 private:
   ConnectionState connection_;
diff --git a/flang/runtime/iostat.cpp b/flang/runtime/iostat.cpp
index c993b778e9e1f8..39e224cb01286b 100644
--- a/flang/runtime/iostat.cpp
+++ b/flang/runtime/iostat.cpp
@@ -9,6 +9,8 @@
 #include "flang/Runtime/iostat.h"
 
 namespace Fortran::runtime::io {
+RT_OFFLOAD_API_GROUP_BEGIN
+
 const char *IostatErrorString(int iostat) {
   switch (iostat) {
   case IostatOk:
@@ -122,4 +124,6 @@ const char *IostatErrorString(int iostat) {
   }
 }
 
+RT_OFFLOAD_API_GROUP_END
+
 } // namespace Fortran::runtime::io
diff --git a/flang/runtime/lock.h b/flang/runtime/lock.h
index 61b06a62ff7c88..9f27a8295c468b 100644
--- a/flang/runtime/lock.h
+++ b/flang/runtime/lock.h
@@ -42,10 +42,10 @@ class Lock {
   // The users of Lock class may use it under
   // USE_PTHREADS and otherwise, so it has to provide
   // all the interfaces.
-  void Take() {}
-  bool Try() { return true; }
-  void Drop() {}
-  bool TakeIfNoDeadlock() { return true; }
+  RT_API_ATTRS void Take() {}
+  RT_API_ATTRS bool Try() { return true; }
+  RT_API_ATTRS void Drop() {}
+  RT_API_ATTRS bool TakeIfNoDeadlock() { return true; }
 #elif USE_PTHREADS
   Lock() { pthread_mutex_init(&mutex_, nullptr); }
   ~Lock() { pthread_mutex_destroy(&mutex_); }
@@ -105,8 +105,10 @@ class Lock {
 
 class CriticalSection {
 public:
-  explicit CriticalSection(Lock &lock) : lock_{lock} { lock_.Take(); }
-  ~CriticalSection() { lock_.Drop(); }
+  explicit RT_API_ATTRS CriticalSection(Lock &lock) : lock_{lock} {
+    lock_.Take();
+  }
+  RT_API_ATTRS ~CriticalSection() { lock_.Drop(); }
 
 private:
   Lock &lock_;
diff --git a/flang/runtime/memory.cpp b/flang/runtime/memory.cpp
index aa6ff9723d1a80..de6c4c72fdac14 100644
--- a/flang/runtime/memory.cpp
+++ b/flang/runtime/memory.cpp
@@ -7,15 +7,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/memory.h"
+#include "freestanding-tools.h"
 #include "terminator.h"
 #include "tools.h"
 #include <cstdlib>
 
 namespace Fortran::runtime {
-RT_OFFLOAD_VAR_GROUP_BEGIN
+RT_OFFLOAD_API_GROUP_BEGIN
 
-RT_API_ATTRS void *AllocateMemoryOrCrash(
-    const Terminator &terminator, std::size_t bytes) {
+void *AllocateMemoryOrCrash(const Terminator &terminator, std::size_t bytes) {
   if (void *p{std::malloc(bytes)}) {
     return p;
   }
@@ -27,7 +27,7 @@ RT_API_ATTRS void *AllocateMemoryOrCrash(
   return nullptr;
 }
 
-RT_API_ATTRS void *ReallocateMemoryOrCrash(
+void *ReallocateMemoryOrCrash(
     const Terminator &terminator, void *ptr, std::size_t newByteSize) {
   if (void *p{Fortran::runtime::realloc(ptr, newByteSize)}) {
     return p;
@@ -40,7 +40,7 @@ RT_API_ATTRS void *ReallocateMemoryOrCrash(
   return nullptr;
 }
 
-RT_API_ATTRS void FreeMemory(void *p) { std::free(p); }
+void FreeMemory(void *p) { std::free(p); }
 
-RT_OFFLOAD_VAR_GROUP_END
+RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime
diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp
index ac9234f4af832b..b502d41a8d5c89 100644
--- a/flang/runtime/namelist.cpp
+++ b/flang/runtime/namelist.cpp
@@ -31,9 +31,12 @@ bool IONAME(OutputNamelist)(Cookie cookie, const NamelistGroup &group) {
   io.CheckFormattedStmtType<Direction::Output>("OutputNamelist");
   io.mutableModes().inNamelist = true;
   ConnectionState &connection{io.GetConnectionState()};
+  // The following lambda definition violates the conding style,
+  // but cuda-11.8 nvcc hits an internal error with the brace initialization.
+
   // Internal function to advance records and convert case
-  const auto EmitUpperCase{[&](const char *prefix, std::size_t prefixLen,
-                               const char *str, char suffix) -> bool {
+  const auto EmitUpperCase = [&](const char *prefix, std::size_t prefixLen,
+                                 const char *str, char suffix) -> bool {
     if ((connection.NeedAdvance(prefixLen) &&
             !(io.AdvanceRecord() && EmitAscii(io, " ", 1))) ||
         !EmitAscii(io, prefix, prefixLen) ||
@@ -49,7 +52,7 @@ bool IONAME(OutputNamelist)(Cookie cookie, const NamelistGroup &group) {
       }
     }
     return suffix == ' ' || EmitAscii(io, &suffix, 1);
-  }};
+  };
   // &GROUP
   if (!EmitUpperCase(" &", 2, group.groupName, ' ')) {
     return false;
@@ -294,7 +297,7 @@ static bool HandleSubstring(
       ch = io.GetNextNonBlank(byteCount);
     }
   }
-  if (ch && ch == ':') {
+  if (ch && *ch == ':') {
     io.HandleRelativePosition(byteCount);
     ch = io.GetNextNonBlank(byteCount);
     if (ch) {
@@ -587,6 +590,8 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
   return true;
 }
 
+RT_OFFLOAD_API_GROUP_BEGIN
+
 bool IsNamelistNameOrSlash(IoStatementState &io) {
   if (auto *listInput{
           io.get_if<ListDirectedStatementState<Direction::Input>>()}) {
@@ -611,4 +616,6 @@ bool IsNamelistNameOrSlash(IoStatementState &io) {
   return false;
 }
 
+RT_OFFLOAD_API_GROUP_END
+
 } // namespace Fortran::runtime::io
diff --git a/flang/runtime/namelist.h b/flang/runtime/namelist.h
index 9a5da33a907e44..1fdc0eb4076eef 100644
--- a/flang/runtime/namelist.h
+++ b/flang/runtime/namelist.h
@@ -12,6 +12,7 @@
 #define FORTRAN_RUNTIME_NAMELIST_H_
 
 #include "non-tbp-dio.h"
+#include "flang/Runtime/api-attrs.h"
 
 #include <cstddef>
 
@@ -47,7 +48,7 @@ class NamelistGroup {
 // character; for use in disambiguating a name-like value (e.g. F or T) from a
 // NAMELIST group item name and for coping with short arrays.  Always false
 // when not reading a NAMELIST.
-bool IsNamelistNameOrSlash(IoStatementState &);
+RT_API_ATTRS bool IsNamelistNameOrSlash(IoStatementState &);
 
 } // namespace Fortran::runtime::io
 #endif // FORTRAN_RUNTIME_NAMELIST_H_
diff --git a/flang/runtime/non-tbp-dio.h b/flang/runtime/non-tbp-dio.h
index a2030dbfdfe8d3..05038a264ed992 100644
--- a/flang/runtime/non-tbp-dio.h
+++ b/flang/runtime/non-tbp-dio.h
@@ -39,7 +39,7 @@ struct NonTbpDefinedIo {
 };
 
 struct NonTbpDefinedIoTable {
-  const NonTbpDefinedIo *Find(
+  RT_API_ATTRS const NonTbpDefinedIo *Find(
       const typeInfo::DerivedType &, common::DefinedIo) const;
   std::size_t items{0};
   const NonTbpDefinedIo *item{nullptr};
diff --git a/flang/runtime/numeric-templates.h b/flang/runtime/numeric-templates.h
index 8ea3daaa57bcf8..f093faf55c3f11 100644
--- a/flang/runtime/numeric-templates.h
+++ b/flang/runtime/numeric-templates.h
@@ -193,11 +193,6 @@ inline RT_API_ATTRS RESULT Exponent(ARG x) {
   }
 }
 
-// Suppress the warnings about calling __host__-only std::frexp,
-// defined in C++ STD header files, from __device__ code.
-RT_DIAG_PUSH
-RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
-
 // FRACTION (16.9.80)
 template <typename T> inline RT_API_ATTRS T Fraction(T x) {
   if (ISNANTy<T>::compute(x)) {
@@ -212,8 +207,6 @@ template <typename T> inline RT_API_ATTRS T Fraction(T x) {
   }
 }
 
-RT_DIAG_POP
-
 // SET_EXPONENT (16.9.171)
 template <typename T> inline RT_API_ATTRS T SetExponent(T x, std::int64_t p) {
   if (ISNANTy<T>::compute(x)) {
diff --git a/flang/runtime/pointer.cpp b/flang/runtime/pointer.cpp
index b01735dc30e691..08a1223764f393 100644
--- a/flang/runtime/pointer.cpp
+++ b/flang/runtime/pointer.cpp
@@ -185,7 +185,6 @@ int RTDEF(PointerDeallocate)(Descriptor &pointer, bool hasStat,
   if (!pointer.IsAllocated()) {
     return ReturnError(terminator, StatBaseNull, errMsg, hasStat);
   }
-#if !defined(RT_DEVICE_COMPILATION)
   if (executionEnvironment.checkPointerDeallocation) {
     // Validate the footer.  This should fail if the pointer doesn't
     // span the entire object, or the object was not allocated as a
@@ -201,7 +200,6 @@ int RTDEF(PointerDeallocate)(Descriptor &pointer, bool hasStat,
           terminator, StatBadPointerDeallocation, errMsg, hasStat);
     }
   }
-#endif
   return ReturnError(terminator,
       pointer.Destroy(/*finalize=*/true, /*destroyPointers=*/true, &terminator),
       errMsg, hasStat);
diff --git a/flang/runtime/pseudo-unit.cpp b/flang/runtime/pseudo-unit.cpp
index 8b5f36e2233a47..a57e3a59efa5fc 100644
--- a/flang/runtime/pseudo-unit.cpp
+++ b/flang/runtime/pseudo-unit.cpp
@@ -11,12 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "io-error.h"
 #include "tools.h"
+#include "unit.h"
 
+// NOTE: the header files above may define OpenMP declare target
+// variables, so they have to be included unconditionally
+// so that the offload entries are consistent between host and device.
 #if defined(RT_USE_PSEUDO_FILE_UNIT)
-
-#include "io-error.h"
-#include "unit.h"
 #include <cstdio>
 
 namespace Fortran::runtime::io {
diff --git a/flang/runtime/terminator.h b/flang/runtime/terminator.h
index 444c68d109eedf..167574c7821b24 100644
--- a/flang/runtime/terminator.h
+++ b/flang/runtime/terminator.h
@@ -67,7 +67,7 @@ class Terminator {
 
   template <typename... Args>
   RT_API_ATTRS void PrintCrashArgs(const char *message, Args... args) const {
-#if RT_DEVICE_COMPILATION
+#if defined(RT_DEVICE_COMPILATION)
     std::printf(message, args...);
 #else
     std::fprintf(stderr, message, args...);
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index 67f4775ae0a99b..b5aa307eade815 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -19,17 +19,24 @@
 
 namespace Fortran::runtime::io {
 
-ExternalFileUnit *defaultInput{nullptr}; // unit 5
-ExternalFileUnit *defaultOutput{nullptr}; // unit 6
-ExternalFileUnit *errorOutput{nullptr}; // unit 0 extension
+RT_OFFLOAD_VAR_GROUP_BEGIN
+RT_VAR_ATTRS ExternalFileUnit *defaultInput{nullptr}; // unit 5
+RT_VAR_ATTRS ExternalFileUnit *defaultOutput{nullptr}; // unit 6
+RT_VAR_ATTRS ExternalFileUnit *errorOutput{nullptr}; // unit 0 extension
+RT_OFFLOAD_VAR_GROUP_END
 
-static inline void SwapEndianness(
+RT_OFFLOAD_API_GROUP_BEGIN
+
+static inline RT_API_ATTRS void SwapEndianness(
     char *data, std::size_t bytes, std::size_t elementBytes) {
   if (elementBytes > 1) {
     auto half{elementBytes >> 1};
     for (std::size_t j{0}; j + elementBytes <= bytes; j += elementBytes) {
       for (std::size_t k{0}; k < half; ++k) {
+        RT_DIAG_PUSH
+        RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
         std::swap(data[j + k], data[j + elementBytes - 1 - k]);
+        RT_DIAG_POP
       }
     }
   }
@@ -475,7 +482,10 @@ bool ExternalFileUnit::SetDirectRec(
 
 void ExternalFileUnit::EndIoStatement() {
   io_.reset();
+  RT_DIAG_PUSH
+  RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
   u_.emplace<std::monostate>();
+  RT_DIAG_POP
   lock_.Drop();
 }
 
@@ -600,7 +610,8 @@ void ExternalFileUnit::BackspaceVariableUnformattedRecord(
 
 // There's no portable memrchr(), unfortunately, and strrchr() would
 // fail on a record with a NUL, so we have to do it the hard way.
-static const char *FindLastNewline(const char *str, std::size_t length) {
+static RT_API_ATTRS const char *FindLastNewline(
+    const char *str, std::size_t length) {
   for (const char *p{str + length}; p >= str; p--) {
     if (*p == '\n') {
       return p;
@@ -741,7 +752,10 @@ std::int32_t ExternalFileUnit::ReadHeaderOrFooter(std::int64_t frameOffset) {
 
 void ChildIo::EndIoStatement() {
   io_.reset();
+  RT_DIAG_PUSH
+  RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
   u_.emplace<std::monostate>();
+  RT_DIAG_POP
 }
 
 Iostat ChildIo::CheckFormattingAndDirection(
@@ -764,4 +778,5 @@ Iostat ChildIo::CheckFormattingAndDirection(
   }
 }
 
+RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime::io
diff --git a/flang/runtime/unit.h b/flang/runtime/unit.h
index 5f854abd42f645..8b7db5cbc90b43 100644
--- a/flang/runtime/unit.h
+++ b/flang/runtime/unit.h
@@ -33,10 +33,12 @@ class UnitMap;
 class ChildIo;
 class ExternalFileUnit;
 
+RT_OFFLOAD_VAR_GROUP_BEGIN
 // Predefined file units.
-extern ExternalFileUnit *defaultInput; // unit 5
-extern ExternalFileUnit *defaultOutput; // unit 6
-extern ExternalFileUnit *errorOutput; // unit 0 extension
+extern RT_VAR_ATTRS ExternalFileUnit *defaultInput; // unit 5
+extern RT_VAR_ATTRS ExternalFileUnit *defaultOutput; // unit 6
+extern RT_VAR_ATTRS ExternalFileUnit *errorOutput; // unit 0 extension
+RT_OFFLOAD_VAR_GROUP_END
 
 #if defined(RT_USE_PSEUDO_FILE_UNIT)
 // A flavor of OpenFile class that pretends to be a terminal,
@@ -49,34 +51,36 @@ class PseudoOpenFile {
 public:
   using FileOffset = std::int64_t;
 
-  const char *path() const { return nullptr; }
-  std::size_t pathLength() const { return 0; }
-  void set_path(OwningPtr<char> &&, std::size_t bytes) {}
-  bool mayRead() const { return false; }
-  bool mayWrite() const { return true; }
-  bool mayPosition() const { return false; }
-  bool mayAsynchronous() const { return false; }
-  void set_mayAsynchronous(bool yes);
+  RT_API_ATTRS const char *path() const { return nullptr; }
+  RT_API_ATTRS std::size_t pathLength() const { return 0; }
+  RT_API_ATTRS void set_path(OwningPtr<char> &&, std::size_t bytes) {}
+  RT_API_ATTRS bool mayRead() const { return false; }
+  RT_API_ATTRS bool mayWrite() const { return true; }
+  RT_API_ATTRS bool mayPosition() const { return false; }
+  RT_API_ATTRS bool mayAsynchronous() const { return false; }
+  RT_API_ATTRS void set_mayAsynchronous(bool yes);
   // Pretend to be a terminal to force the output
   // at the end of IO statement.
-  bool isTerminal() const { return true; }
-  bool isWindowsTextFile() const { return false; }
-  Fortran::common::optional<FileOffset> knownSize() const;
-  bool IsConnected() const { return false; }
-  void Open(OpenStatus, Fortran::common::optional<Action>, Position,
-      IoErrorHandler &);
-  void Predefine(int fd) {}
-  void Close(CloseStatus, IoErrorHandler &);
-  std::size_t Read(FileOffset, char *, std::size_t minBytes,
+  RT_API_ATTRS bool isTerminal() const { return true; }
+  RT_API_ATTRS bool isWindowsTextFile() const { return false; }
+  RT_API_ATTRS Fortran::common::optional<FileOffset> knownSize() const;
+  RT_API_ATTRS bool IsConnected() const { return false; }
+  RT_API_ATTRS void Open(OpenStatus, Fortran::common::optional<Action>,
+      Position, IoErrorHandler &);
+  RT_API_ATTRS void Predefine(int fd) {}
+  RT_API_ATTRS void Close(CloseStatus, IoErrorHandler &);
+  RT_API_ATTRS std::size_t Read(FileOffset, char *, std::size_t minBytes,
       std::size_t maxBytes, IoErrorHandler &);
-  std::size_t Write(FileOffset, const char *, std::size_t, IoErrorHandler &);
-  void Truncate(FileOffset, IoErrorHandler &);
-  int ReadAsynchronously(FileOffset, char *, std::size_t, IoErrorHandler &);
-  int WriteAsynchronously(
+  RT_API_ATTRS std::size_t Write(
+      FileOffset, const char *, std::size_t, IoErrorHandler &);
+  RT_API_ATTRS void Truncate(FileOffset, IoErrorHandler &);
+  RT_API_ATTRS int ReadAsynchronously(
+      FileOffset, char *, std::size_t, IoErrorHandler &);
+  RT_API_ATTRS int WriteAsynchronously(
       FileOffset, const char *, std::size_t, IoErrorHandler &);
-  void Wait(int id, IoErrorHandler &);
-  void WaitAll(IoErrorHandler &);
-  Position InquirePosition() const;
+  RT_API_ATTRS void Wait(int id, IoErrorHandler &);
+  RT_API_ATTRS void WaitAll(IoErrorHandler &);
+  RT_API_ATTRS Position InquirePosition() const;
 };
 #endif // defined(RT_USE_PSEUDO_FILE_UNIT)
 
@@ -95,44 +99,51 @@ class ExternalFileUnit : public ConnectionState,
 public:
   static constexpr int maxAsyncIds{64 * 16};
 
-  explicit ExternalFileUnit(int unitNumber) : unitNumber_{unitNumber} {
+  explicit RT_API_ATTRS ExternalFileUnit(int unitNumber)
+      : unitNumber_{unitNumber} {
     isUTF8 = executionEnvironment.defaultUTF8;
     for (int j{0}; 64 * j < maxAsyncIds; ++j) {
       asyncIdAvailable_[j].set();
     }
     asyncIdAvailable_[0].reset(0);
   }
-  ~ExternalFileUnit() {}
+  RT_API_ATTRS ~ExternalFileUnit() {}
 
-  int unitNumber() const { return unitNumber_; }
-  bool swapEndianness() const { return swapEndianness_; }
-  bool createdForInternalChildIo() const { return createdForInternalChildIo_; }
+  RT_API_ATTRS int unitNumber() const { return unitNumber_; }
+  RT_API_ATTRS bool swapEndianness() const { return swapEndianness_; }
+  RT_API_ATTRS bool createdForInternalChildIo() const {
+    return createdForInternalChildIo_;
+  }
 
-  static ExternalFileUnit *LookUp(int unit);
-  static ExternalFileUnit *LookUpOrCreate(
+  static RT_API_ATTRS ExternalFileUnit *LookUp(int unit);
+  static RT_API_ATTRS ExternalFileUnit *LookUpOrCreate(
       int unit, const Terminator &, bool &wasExtant);
-  static ExternalFileUnit *LookUpOrCreateAnonymous(int unit, Direction,
-      Fortran::common::optional<bool> isUnformatted, const Terminator &);
-  static ExternalFileUnit *LookUp(const char *path, std::size_t pathLen);
-  static ExternalFileUnit &CreateNew(int unit, const Terminator &);
-  static ExternalFileUnit *LookUpForClose(int unit);
-  static ExternalFileUnit &NewUnit(const Terminator &, bool forChildIo);
-  static void CloseAll(IoErrorHandler &);
-  static void FlushAll(IoErrorHandler &);
+  static RT_API_ATTRS ExternalFileUnit *LookUpOrCreateAnonymous(int unit,
+      Direction, Fortran::common::optional<bool> isUnformatted,
+      const Terminator &);
+  static RT_API_ATTRS ExternalFileUnit *LookUp(
+      const char *path, std::size_t pathLen);
+  static RT_API_ATTRS ExternalFileUnit &CreateNew(int unit, const Terminator &);
+  static RT_API_ATTRS ExternalFileUnit *LookUpForClose(int unit);
+  static RT_API_ATTRS ExternalFileUnit &NewUnit(
+      const Terminator &, bool forChildIo);
+  static RT_API_ATTRS void CloseAll(IoErrorHandler &);
+  static RT_API_ATTRS void FlushAll(IoErrorHandler &);
 
   // Returns true if an existing unit was closed
-  bool OpenUnit(Fortran::common::optional<OpenStatus>,
+  RT_API_ATTRS bool OpenUnit(Fortran::common::optional<OpenStatus>,
       Fortran::common::optional<Action>, Position, OwningPtr<char> &&path,
       std::size_t pathLength, Convert, IoErrorHandler &);
-  void OpenAnonymousUnit(Fortran::common::optional<OpenStatus>,
+  RT_API_ATTRS void OpenAnonymousUnit(Fortran::common::optional<OpenStatus>,
       Fortran::common::optional<Action>, Position, Convert, IoErrorHandler &);
-  void CloseUnit(CloseStatus, IoErrorHandler &);
-  void DestroyClosed();
+  RT_API_ATTRS void CloseUnit(CloseStatus, IoErrorHandler &);
+  RT_API_ATTRS void DestroyClosed();
 
-  Iostat SetDirection(Direction);
+  RT_API_ATTRS Iostat SetDirection(Direction);
 
   template <typename A, typename... X>
-  IoStatementState &BeginIoStatement(const Terminator &terminator, X &&...xs) {
+  RT_API_ATTRS IoStatementState &BeginIoStatement(
+      const Terminator &terminator, X &&...xs) {
     // Take lock_ and hold it until EndIoStatement().
 #if USE_PTHREADS
     if (!lock_.TakeIfNoDeadlock()) {
@@ -141,7 +152,10 @@ class ExternalFileUnit : public ConnectionState,
 #else
     lock_.Take();
 #endif
+    RT_DIAG_PUSH
+    RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
     A &state{u_.emplace<A>(std::forward<X>(xs)...)};
+    RT_DIAG_POP
     if constexpr (!std::is_same_v<A, OpenStatementState>) {
       state.mutableModes() = ConnectionState::modes;
     }
@@ -150,50 +164,54 @@ class ExternalFileUnit : public ConnectionState,
     return *io_;
   }
 
-  bool Emit(
+  RT_API_ATTRS bool Emit(
       const char *, std::size_t, std::size_t elementBytes, IoErrorHandler &);
-  bool Receive(char *, std::size_t, std::size_t elementBytes, IoErrorHandler &);
-  std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
-  bool BeginReadingRecord(IoErrorHandler &);
-  void FinishReadingRecord(IoErrorHandler &);
-  bool AdvanceRecord(IoErrorHandler &);
-  void BackspaceRecord(IoErrorHandler &);
-  void FlushOutput(IoErrorHandler &);
-  void FlushIfTerminal(IoErrorHandler &);
-  void Endfile(IoErrorHandler &);
-  void Rewind(IoErrorHandler &);
-  void EndIoStatement();
-  bool SetStreamPos(std::int64_t, IoErrorHandler &); // one-based, for POS=
-  bool SetDirectRec(std::int64_t, IoErrorHandler &); // one-based, for REC=
-  std::int64_t InquirePos() const {
+  RT_API_ATTRS bool Receive(
+      char *, std::size_t, std::size_t elementBytes, IoErrorHandler &);
+  RT_API_ATTRS std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
+  RT_API_ATTRS bool BeginReadingRecord(IoErrorHandler &);
+  RT_API_ATTRS void FinishReadingRecord(IoErrorHandler &);
+  RT_API_ATTRS bool AdvanceRecord(IoErrorHandler &);
+  RT_API_ATTRS void BackspaceRecord(IoErrorHandler &);
+  RT_API_ATTRS void FlushOutput(IoErrorHandler &);
+  RT_API_ATTRS void FlushIfTerminal(IoErrorHandler &);
+  RT_API_ATTRS void Endfile(IoErrorHandler &);
+  RT_API_ATTRS void Rewind(IoErrorHandler &);
+  RT_API_ATTRS void EndIoStatement();
+  RT_API_ATTRS bool SetStreamPos(
+      std::int64_t, IoErrorHandler &); // one-based, for POS=
+  RT_API_ATTRS bool SetDirectRec(
+      std::int64_t, IoErrorHandler &); // one-based, for REC=
+  RT_API_ATTRS std::int64_t InquirePos() const {
     // 12.6.2.11 defines POS=1 as the beginning of file
     return frameOffsetInFile_ + recordOffsetInFrame_ + positionInRecord + 1;
   }
 
-  ChildIo *GetChildIo() { return child_.get(); }
-  ChildIo &PushChildIo(IoStatementState &);
-  void PopChildIo(ChildIo &);
+  RT_API_ATTRS ChildIo *GetChildIo() { return child_.get(); }
+  RT_API_ATTRS ChildIo &PushChildIo(IoStatementState &);
+  RT_API_ATTRS void PopChildIo(ChildIo &);
 
-  int GetAsynchronousId(IoErrorHandler &);
-  bool Wait(int);
+  RT_API_ATTRS int GetAsynchronousId(IoErrorHandler &);
+  RT_API_ATTRS bool Wait(int);
 
 private:
-  static UnitMap &CreateUnitMap();
-  static UnitMap &GetUnitMap();
-  const char *FrameNextInput(IoErrorHandler &, std::size_t);
-  void SetPosition(std::int64_t, IoErrorHandler &); // zero-based
-  void BeginSequentialVariableUnformattedInputRecord(IoErrorHandler &);
-  void BeginVariableFormattedInputRecord(IoErrorHandler &);
-  void BackspaceFixedRecord(IoErrorHandler &);
-  void BackspaceVariableUnformattedRecord(IoErrorHandler &);
-  void BackspaceVariableFormattedRecord(IoErrorHandler &);
-  bool SetVariableFormattedRecordLength();
-  void DoImpliedEndfile(IoErrorHandler &);
-  void DoEndfile(IoErrorHandler &);
-  void CommitWrites();
-  bool CheckDirectAccess(IoErrorHandler &);
-  void HitEndOnRead(IoErrorHandler &);
-  std::int32_t ReadHeaderOrFooter(std::int64_t frameOffset);
+  static RT_API_ATTRS UnitMap &CreateUnitMap();
+  static RT_API_ATTRS UnitMap &GetUnitMap();
+  RT_API_ATTRS const char *FrameNextInput(IoErrorHandler &, std::size_t);
+  RT_API_ATTRS void SetPosition(std::int64_t, IoErrorHandler &); // zero-based
+  RT_API_ATTRS void BeginSequentialVariableUnformattedInputRecord(
+      IoErrorHandler &);
+  RT_API_ATTRS void BeginVariableFormattedInputRecord(IoErrorHandler &);
+  RT_API_ATTRS void BackspaceFixedRecord(IoErrorHandler &);
+  RT_API_ATTRS void BackspaceVariableUnformattedRecord(IoErrorHandler &);
+  RT_API_ATTRS void BackspaceVariableFormattedRecord(IoErrorHandler &);
+  RT_API_ATTRS bool SetVariableFormattedRecordLength();
+  RT_API_ATTRS void DoImpliedEndfile(IoErrorHandler &);
+  RT_API_ATTRS void DoEndfile(IoErrorHandler &);
+  RT_API_ATTRS void CommitWrites();
+  RT_API_ATTRS bool CheckDirectAccess(IoErrorHandler &);
+  RT_API_ATTRS void HitEndOnRead(IoErrorHandler &);
+  RT_API_ATTRS std::int32_t ReadHeaderOrFooter(std::int64_t frameOffset);
 
   Lock lock_;
 
@@ -238,23 +256,28 @@ class ExternalFileUnit : public ConnectionState,
 // be a child I/O statement.
 class ChildIo {
 public:
-  ChildIo(IoStatementState &parent, OwningPtr<ChildIo> &&previous)
+  RT_API_ATTRS ChildIo(IoStatementState &parent, OwningPtr<ChildIo> &&previous)
       : parent_{parent}, previous_{std::move(previous)} {}
 
-  IoStatementState &parent() const { return parent_; }
+  RT_API_ATTRS IoStatementState &parent() const { return parent_; }
 
-  void EndIoStatement();
+  RT_API_ATTRS void EndIoStatement();
 
   template <typename A, typename... X>
-  IoStatementState &BeginIoStatement(X &&...xs) {
+  RT_API_ATTRS IoStatementState &BeginIoStatement(X &&...xs) {
+    RT_DIAG_PUSH
+    RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
     A &state{u_.emplace<A>(std::forward<X>(xs)...)};
+    RT_DIAG_POP
     io_.emplace(state);
     return *io_;
   }
 
-  OwningPtr<ChildIo> AcquirePrevious() { return std::move(previous_); }
+  RT_API_ATTRS OwningPtr<ChildIo> AcquirePrevious() {
+    return std::move(previous_);
+  }
 
-  Iostat CheckFormattingAndDirection(bool unformatted, Direction);
+  RT_API_ATTRS Iostat CheckFormattingAndDirection(bool unformatted, Direction);
 
 private:
   IoStatementState &parent_;
diff --git a/flang/runtime/utf.cpp b/flang/runtime/utf.cpp
index e9ccc2c04b6b07..9945dc6509ecbd 100644
--- a/flang/runtime/utf.cpp
+++ b/flang/runtime/utf.cpp
@@ -11,7 +11,8 @@
 namespace Fortran::runtime {
 
 // clang-format off
-const std::uint8_t UTF8FirstByteTable[256]{
+RT_OFFLOAD_VAR_GROUP_BEGIN
+const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]{
   /* 00 - 7F:  7 bit payload in single byte */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -37,8 +38,10 @@ const std::uint8_t UTF8FirstByteTable[256]{
   /* FE:      32 bit payload */ 7,
   /* FF:      invalid */ 0
 };
+RT_OFFLOAD_VAR_GROUP_END
 // clang-format on
 
+RT_OFFLOAD_API_GROUP_BEGIN
 // Non-minimal encodings are accepted.
 Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) {
   const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};
@@ -107,5 +110,6 @@ std::size_t EncodeUTF8(char *p0, char32_t ucs) {
     return 7;
   }
 }
+RT_OFFLOAD_API_GROUP_END
 
 } // namespace Fortran::runtime
diff --git a/flang/runtime/utf.h b/flang/runtime/utf.h
index 2b4e4f9a188758..29670d54b3eb6f 100644
--- a/flang/runtime/utf.h
+++ b/flang/runtime/utf.h
@@ -49,20 +49,22 @@ namespace Fortran::runtime {
 
 // Derive the length of a UTF-8 character encoding from its first byte.
 // A zero result signifies an invalid encoding.
-extern const std::uint8_t UTF8FirstByteTable[256];
-static inline std::size_t MeasureUTF8Bytes(char first) {
+RT_OFFLOAD_VAR_GROUP_BEGIN
+extern const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256];
+static constexpr std::size_t maxUTF8Bytes{7};
+RT_OFFLOAD_VAR_GROUP_END
+
+static inline RT_API_ATTRS std::size_t MeasureUTF8Bytes(char first) {
   return UTF8FirstByteTable[static_cast<std::uint8_t>(first)];
 }
 
-static constexpr std::size_t maxUTF8Bytes{7};
-
 // Ensure that all bytes are present in sequence in the input buffer
 // before calling; use MeasureUTF8Bytes(first byte) to count them.
-Fortran::common::optional<char32_t> DecodeUTF8(const char *);
+RT_API_ATTRS Fortran::common::optional<char32_t> DecodeUTF8(const char *);
 
 // Ensure that at least maxUTF8Bytes remain in the output
 // buffer before calling.
-std::size_t EncodeUTF8(char *, char32_t);
+RT_API_ATTRS std::size_t EncodeUTF8(char *, char32_t);
 
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_UTF_H_

>From 1daaf45de940354c5529a6bafe2d802de2bf937f Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Thu, 21 Mar 2024 22:27:45 -0700
Subject: [PATCH 2/4] clang-format

---
 flang/runtime/edit-output.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flang/runtime/edit-output.cpp b/flang/runtime/edit-output.cpp
index f3cd94bfe32279..b710c298babebf 100644
--- a/flang/runtime/edit-output.cpp
+++ b/flang/runtime/edit-output.cpp
@@ -641,8 +641,8 @@ RT_API_ATTRS bool RealOutputEditing<KIND>::EditListDirectedOutput(
 // follows that precedent so as to avoid a gratuitous incompatibility.
 template <int KIND>
 RT_API_ATTRS auto RealOutputEditing<KIND>::ConvertToHexadecimal(
-    int significantDigits, enum decimal::FortranRounding rounding, int flags)
-    -> ConvertToHexadecimalResult {
+    int significantDigits, enum decimal::FortranRounding rounding,
+    int flags) -> ConvertToHexadecimalResult {
   if (x_.IsNaN() || x_.IsInfinite()) {
     auto converted{ConvertToDecimal(significantDigits, rounding, flags)};
     return {converted.str, static_cast<int>(converted.length), 0};

>From 729c5db14ffe076570b0fc7685cc7d45d116156b Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Fri, 22 Mar 2024 11:45:12 -0700
Subject: [PATCH 3/4] Moved api-attrs.h to flang/Common.

---
 flang/include/flang/{Runtime => Common}/api-attrs.h | 2 +-
 flang/include/flang/Common/optional.h               | 2 +-
 flang/include/flang/Common/real.h                   | 2 +-
 flang/include/flang/Common/reference-wrapper.h      | 2 +-
 flang/include/flang/Common/restorer.h               | 2 +-
 flang/include/flang/Common/uint128.h                | 2 +-
 flang/include/flang/Common/visit.h                  | 2 +-
 flang/include/flang/Decimal/binary-floating-point.h | 2 +-
 flang/include/flang/Decimal/decimal.h               | 2 +-
 flang/include/flang/ISO_Fortran_binding_wrapper.h   | 4 ++--
 flang/include/flang/Runtime/entry-names.h           | 2 +-
 flang/include/flang/Runtime/iostat.h                | 2 +-
 flang/include/flang/Runtime/memory.h                | 2 +-
 flang/runtime/derived.h                             | 2 +-
 flang/runtime/freestanding-tools.h                  | 2 +-
 flang/runtime/namelist.h                            | 2 +-
 flang/runtime/numeric-templates.h                   | 2 +-
 flang/runtime/stat.h                                | 2 +-
 flang/runtime/terminator.h                          | 2 +-
 19 files changed, 20 insertions(+), 20 deletions(-)
 rename flang/include/flang/{Runtime => Common}/api-attrs.h (98%)

diff --git a/flang/include/flang/Runtime/api-attrs.h b/flang/include/flang/Common/api-attrs.h
similarity index 98%
rename from flang/include/flang/Runtime/api-attrs.h
rename to flang/include/flang/Common/api-attrs.h
index 050d2366b8e165..4d069c6097ddfe 100644
--- a/flang/include/flang/Runtime/api-attrs.h
+++ b/flang/include/flang/Common/api-attrs.h
@@ -1,4 +1,4 @@
-/*===-- include/flang/Runtime/api-attrs.h ---------------------------*- C -*-=//
+/*===-- include/flang/Common/api-attrs.h ---------------------------*- C -*-=//
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/include/flang/Common/optional.h b/flang/include/flang/Common/optional.h
index b5623b84dbd369..c0f4278009f40a 100644
--- a/flang/include/flang/Common/optional.h
+++ b/flang/include/flang/Common/optional.h
@@ -26,7 +26,7 @@
 #ifndef FORTRAN_COMMON_OPTIONAL_H
 #define FORTRAN_COMMON_OPTIONAL_H
 
-#include "flang/Runtime/api-attrs.h"
+#include "flang/Common/api-attrs.h"
 #include <optional>
 #include <type_traits>
 
diff --git a/flang/include/flang/Common/real.h b/flang/include/flang/Common/real.h
index 9ca58bed2dd7c2..49c400b368a2c1 100644
--- a/flang/include/flang/Common/real.h
+++ b/flang/include/flang/Common/real.h
@@ -13,7 +13,7 @@
 // The various representations are distinguished by their binary precisions
 // (number of explicit significand bits and any implicit MSB in the fraction).
 
-#include "flang/Runtime/api-attrs.h"
+#include "flang/Common/api-attrs.h"
 #include <cinttypes>
 
 namespace Fortran::common {
diff --git a/flang/include/flang/Common/reference-wrapper.h b/flang/include/flang/Common/reference-wrapper.h
index 66f924662d9612..2983754108f95a 100644
--- a/flang/include/flang/Common/reference-wrapper.h
+++ b/flang/include/flang/Common/reference-wrapper.h
@@ -25,7 +25,7 @@
 #ifndef FORTRAN_COMMON_REFERENCE_WRAPPER_H
 #define FORTRAN_COMMON_REFERENCE_WRAPPER_H
 
-#include "flang/Runtime/api-attrs.h"
+#include "flang/Common/api-attrs.h"
 #include <functional>
 #include <type_traits>
 
diff --git a/flang/include/flang/Common/restorer.h b/flang/include/flang/Common/restorer.h
index 36bf11d09bbb99..0f1bc48620d37e 100644
--- a/flang/include/flang/Common/restorer.h
+++ b/flang/include/flang/Common/restorer.h
@@ -19,7 +19,7 @@
 #ifndef FORTRAN_COMMON_RESTORER_H_
 #define FORTRAN_COMMON_RESTORER_H_
 #include "idioms.h"
-#include "flang/Runtime/api-attrs.h"
+#include "flang/Common/api-attrs.h"
 namespace Fortran::common {
 template <typename A> class Restorer {
 public:
diff --git a/flang/include/flang/Common/uint128.h b/flang/include/flang/Common/uint128.h
index 55841c0d9b9028..821c8c3b08a52f 100644
--- a/flang/include/flang/Common/uint128.h
+++ b/flang/include/flang/Common/uint128.h
@@ -20,7 +20,7 @@
 #endif
 
 #include "leading-zero-bit-count.h"
-#include "flang/Runtime/api-attrs.h"
+#include "flang/Common/api-attrs.h"
 #include <cstdint>
 #include <type_traits>
 
diff --git a/flang/include/flang/Common/visit.h b/flang/include/flang/Common/visit.h
index 9d9048c8f4bf10..54f8ca70b313c7 100644
--- a/flang/include/flang/Common/visit.h
+++ b/flang/include/flang/Common/visit.h
@@ -21,7 +21,7 @@
 #ifndef FORTRAN_COMMON_VISIT_H_
 #define FORTRAN_COMMON_VISIT_H_
 
-#include "flang/Runtime/api-attrs.h"
+#include "flang/Common/api-attrs.h"
 #include <type_traits>
 #include <variant>
 
diff --git a/flang/include/flang/Decimal/binary-floating-point.h b/flang/include/flang/Decimal/binary-floating-point.h
index 1c8829550043de..4919c1f9d240f4 100644
--- a/flang/include/flang/Decimal/binary-floating-point.h
+++ b/flang/include/flang/Decimal/binary-floating-point.h
@@ -12,9 +12,9 @@
 // Access and manipulate the fields of an IEEE-754 binary
 // floating-point value via a generalized template.
 
+#include "flang/Common/api-attrs.h"
 #include "flang/Common/real.h"
 #include "flang/Common/uint128.h"
-#include "flang/Runtime/api-attrs.h"
 #include <cinttypes>
 #include <climits>
 #include <cstring>
diff --git a/flang/include/flang/Decimal/decimal.h b/flang/include/flang/Decimal/decimal.h
index aeda01c44fa6f6..443163d058e28b 100644
--- a/flang/include/flang/Decimal/decimal.h
+++ b/flang/include/flang/Decimal/decimal.h
@@ -12,7 +12,7 @@
 #ifndef FORTRAN_DECIMAL_DECIMAL_H_
 #define FORTRAN_DECIMAL_DECIMAL_H_
 
-#include "flang/Runtime/api-attrs.h"
+#include "flang/Common/api-attrs.h"
 #include <stddef.h>
 
 #ifdef __cplusplus
diff --git a/flang/include/flang/ISO_Fortran_binding_wrapper.h b/flang/include/flang/ISO_Fortran_binding_wrapper.h
index 83c974365e3439..37289bdbabd03c 100644
--- a/flang/include/flang/ISO_Fortran_binding_wrapper.h
+++ b/flang/include/flang/ISO_Fortran_binding_wrapper.h
@@ -13,7 +13,7 @@
 /* A thin wrapper around flang/include/ISO_Fortran_binding.h
  * This header file must be included when ISO_Fortran_binding.h
  * definitions/declarations are needed in Flang compiler/runtime
- * sources. The inclusion of Runtime/api-attrs.h below sets up
+ * sources. The inclusion of Common/api-attrs.h below sets up
  * proper values for the macros used in ISO_Fortran_binding.h
  * for the device offload builds.
  * flang/include/ISO_Fortran_binding.h is made a standalone
@@ -23,7 +23,7 @@
 
 /* clang-format off */
 #include <stddef.h>
-#include "Runtime/api-attrs.h"
+#include "Common/api-attrs.h"
 #ifdef __cplusplus
 namespace Fortran {
 namespace ISO {
diff --git a/flang/include/flang/Runtime/entry-names.h b/flang/include/flang/Runtime/entry-names.h
index a233edf8e987dc..68582b92b54941 100644
--- a/flang/include/flang/Runtime/entry-names.h
+++ b/flang/include/flang/Runtime/entry-names.h
@@ -19,7 +19,7 @@
 #ifndef FORTRAN_RUNTIME_ENTRY_NAMES_H
 #define FORTRAN_RUNTIME_ENTRY_NAMES_H
 
-#include "flang/Runtime/api-attrs.h"
+#include "flang/Common/api-attrs.h"
 
 #ifndef RTNAME
 #define NAME_WITH_PREFIX_AND_REVISION(prefix, revision, name) \
diff --git a/flang/include/flang/Runtime/iostat.h b/flang/include/flang/Runtime/iostat.h
index c3ec8cae858163..6ce7c82b424eb7 100644
--- a/flang/include/flang/Runtime/iostat.h
+++ b/flang/include/flang/Runtime/iostat.h
@@ -11,7 +11,7 @@
 
 #ifndef FORTRAN_RUNTIME_IOSTAT_H_
 #define FORTRAN_RUNTIME_IOSTAT_H_
-#include "flang/Runtime/api-attrs.h"
+#include "flang/Common/api-attrs.h"
 #include "flang/Runtime/magic-numbers.h"
 namespace Fortran::runtime::io {
 
diff --git a/flang/include/flang/Runtime/memory.h b/flang/include/flang/Runtime/memory.h
index 0f2e7c3904f580..98412a989f890b 100644
--- a/flang/include/flang/Runtime/memory.h
+++ b/flang/include/flang/Runtime/memory.h
@@ -12,7 +12,7 @@
 #ifndef FORTRAN_RUNTIME_MEMORY_H_
 #define FORTRAN_RUNTIME_MEMORY_H_
 
-#include "flang/Runtime/api-attrs.h"
+#include "flang/Common/api-attrs.h"
 #include <cassert>
 #include <memory>
 #include <type_traits>
diff --git a/flang/runtime/derived.h b/flang/runtime/derived.h
index e43ecc34a31d1b..b4863df8db417c 100644
--- a/flang/runtime/derived.h
+++ b/flang/runtime/derived.h
@@ -11,7 +11,7 @@
 #ifndef FORTRAN_RUNTIME_DERIVED_H_
 #define FORTRAN_RUNTIME_DERIVED_H_
 
-#include "flang/Runtime/api-attrs.h"
+#include "flang/Common/api-attrs.h"
 
 namespace Fortran::runtime::typeInfo {
 class DerivedType;
diff --git a/flang/runtime/freestanding-tools.h b/flang/runtime/freestanding-tools.h
index 818a4dd53eb762..451bf13b9fa6da 100644
--- a/flang/runtime/freestanding-tools.h
+++ b/flang/runtime/freestanding-tools.h
@@ -9,7 +9,7 @@
 #ifndef FORTRAN_RUNTIME_FREESTANDING_TOOLS_H_
 #define FORTRAN_RUNTIME_FREESTANDING_TOOLS_H_
 
-#include "flang/Runtime/api-attrs.h"
+#include "flang/Common/api-attrs.h"
 #include "flang/Runtime/c-or-cpp.h"
 #include <algorithm>
 #include <cstring>
diff --git a/flang/runtime/namelist.h b/flang/runtime/namelist.h
index 1fdc0eb4076eef..25216a75e9367d 100644
--- a/flang/runtime/namelist.h
+++ b/flang/runtime/namelist.h
@@ -12,7 +12,7 @@
 #define FORTRAN_RUNTIME_NAMELIST_H_
 
 #include "non-tbp-dio.h"
-#include "flang/Runtime/api-attrs.h"
+#include "flang/Common/api-attrs.h"
 
 #include <cstddef>
 
diff --git a/flang/runtime/numeric-templates.h b/flang/runtime/numeric-templates.h
index f093faf55c3f11..af552f9ddfc0bd 100644
--- a/flang/runtime/numeric-templates.h
+++ b/flang/runtime/numeric-templates.h
@@ -20,8 +20,8 @@
 
 #include "terminator.h"
 #include "tools.h"
+#include "flang/Common/api-attrs.h"
 #include "flang/Common/float128.h"
-#include "flang/Runtime/api-attrs.h"
 #include <cstdint>
 #include <limits>
 
diff --git a/flang/runtime/stat.h b/flang/runtime/stat.h
index 55cdac46eb3a57..4f46f52ecb2941 100644
--- a/flang/runtime/stat.h
+++ b/flang/runtime/stat.h
@@ -11,8 +11,8 @@
 
 #ifndef FORTRAN_RUNTIME_STAT_H_
 #define FORTRAN_RUNTIME_STAT_H_
+#include "flang/Common/api-attrs.h"
 #include "flang/ISO_Fortran_binding_wrapper.h"
-#include "flang/Runtime/api-attrs.h"
 #include "flang/Runtime/magic-numbers.h"
 namespace Fortran::runtime {
 
diff --git a/flang/runtime/terminator.h b/flang/runtime/terminator.h
index 167574c7821b24..59a47ce93e7c90 100644
--- a/flang/runtime/terminator.h
+++ b/flang/runtime/terminator.h
@@ -11,7 +11,7 @@
 #ifndef FORTRAN_RUNTIME_TERMINATOR_H_
 #define FORTRAN_RUNTIME_TERMINATOR_H_
 
-#include "flang/Runtime/api-attrs.h"
+#include "flang/Common/api-attrs.h"
 #include <cstdarg>
 #include <cstdio>
 #include <cstdlib>

>From e982b2ea2cbc2c49ea1bfbb0e45141fb9fcf900a Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Mon, 25 Mar 2024 14:07:42 -0700
Subject: [PATCH 4/4] Removed '-G -g' from CMake file.

---
 flang/runtime/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index d002f68f0ab8e6..335ef370727461 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -254,7 +254,6 @@ if (FLANG_EXPERIMENTAL_CUDA_RUNTIME)
       #   'long double' is treated as 'double' in device code
       -Xcudafe --diag_suppress=20208
       -Xcudafe --display_error_number
-      -G -g
       )
   endif()
   set_source_files_properties(${supported_files} PROPERTIES COMPILE_OPTIONS



More information about the flang-commits mailing list