[flang-commits] [flang] [flang][runtime] Enable more APIs in the offload build. (PR #76486)

Slava Zakharin via flang-commits flang-commits at lists.llvm.org
Thu Dec 28 13:35:18 PST 2023


https://github.com/vzakhari updated https://github.com/llvm/llvm-project/pull/76486

>From a42a04c585f7e144556db9c1f46f0dbc0cf84384 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Wed, 27 Dec 2023 18:45:24 -0800
Subject: [PATCH 1/3] [flang][runtime] Enable more APIs in the offload build.

---
 .../include/flang/Runtime/array-constructor.h |  20 +-
 flang/include/flang/Runtime/character.h       |  76 ++---
 flang/include/flang/Runtime/descriptor.h      |   7 +-
 flang/include/flang/Runtime/inquiry.h         |   8 +-
 flang/include/flang/Runtime/misc-intrinsic.h  |   4 +-
 flang/include/flang/Runtime/pointer.h         |  36 +--
 flang/include/flang/Runtime/ragged.h          |   9 +-
 flang/runtime/CMakeLists.txt                  |  30 +-
 flang/runtime/allocatable.cpp                 |   3 +
 flang/runtime/array-constructor.cpp           |  21 +-
 flang/runtime/character.cpp                   | 128 ++++----
 flang/runtime/copy.cpp                        |   7 +-
 flang/runtime/derived-api.cpp                 |   2 +
 flang/runtime/dot-product.cpp                 |  59 ++--
 flang/runtime/extrema.cpp                     | 275 ++++++++++--------
 flang/runtime/findloc.cpp                     |  73 +++--
 flang/runtime/freestanding-tools.h            |  35 +++
 flang/runtime/inquiry.cpp                     |   8 +-
 flang/runtime/matmul-transpose.cpp            |   4 +
 flang/runtime/matmul.cpp                      |   4 +
 flang/runtime/memory.cpp                      |   8 +-
 flang/runtime/misc-intrinsic.cpp              |  15 +-
 flang/runtime/numeric.cpp                     |   3 +
 flang/runtime/pointer.cpp                     |  34 ++-
 flang/runtime/product.cpp                     |  72 +++--
 flang/runtime/ragged.cpp                      |  23 +-
 flang/runtime/reduction.cpp                   |   2 +
 flang/runtime/sum.cpp                         |   4 +
 flang/runtime/support.cpp                     |   2 +
 29 files changed, 570 insertions(+), 402 deletions(-)

diff --git a/flang/include/flang/Runtime/array-constructor.h b/flang/include/flang/Runtime/array-constructor.h
index 5274a2fc9e08c5..46fc0418c7991e 100644
--- a/flang/include/flang/Runtime/array-constructor.h
+++ b/flang/include/flang/Runtime/array-constructor.h
@@ -21,15 +21,17 @@ namespace Fortran::runtime {
 // Runtime data structure to hold information about the storage of
 // an array constructor being constructed.
 struct ArrayConstructorVector {
-  ArrayConstructorVector(class Descriptor &to, SubscriptValue nextValuePosition,
-      SubscriptValue actualAllocationSize, const char *sourceFile,
-      int sourceLine, bool useValueLengthParameters)
+  RT_API_ATTRS ArrayConstructorVector(class Descriptor &to,
+      SubscriptValue nextValuePosition, SubscriptValue actualAllocationSize,
+      const char *sourceFile, int sourceLine, bool useValueLengthParameters)
       : to{to}, nextValuePosition{nextValuePosition},
         actualAllocationSize{actualAllocationSize}, sourceFile{sourceFile},
-        sourceLine{sourceLine}, useValueLengthParameters_{
-                                    useValueLengthParameters} {}
+        sourceLine{sourceLine},
+        useValueLengthParameters_{useValueLengthParameters} {}
 
-  bool useValueLengthParameters() const { return useValueLengthParameters_; }
+  RT_API_ATTRS bool useValueLengthParameters() const {
+    return useValueLengthParameters_;
+  }
 
   class Descriptor &to;
   SubscriptValue nextValuePosition;
@@ -95,13 +97,13 @@ extern "C" {
 // the target the runtime is compiled for). This avoids the need for the runtime
 // to maintain a state, or to use dynamic allocation for it. "vectorClassSize"
 // is used to validate that lowering allocated enough space for it.
-void RTNAME(InitArrayConstructorVector)(ArrayConstructorVector &vector,
+void RTDECL(InitArrayConstructorVector)(ArrayConstructorVector &vector,
     Descriptor &to, bool useValueLengthParameters, int vectorClassSize,
     const char *sourceFile = nullptr, int sourceLine = 0);
 
 // Generic API to push any kind of entity into the array constructor (any
 // Fortran type and any rank).
-void RTNAME(PushArrayConstructorValue)(
+void RTDECL(PushArrayConstructorValue)(
     ArrayConstructorVector &vector, const Descriptor &from);
 
 // API to push scalar array constructor value of:
@@ -109,7 +111,7 @@ void RTNAME(PushArrayConstructorValue)(
 //   - or a derived type that has no length parameters, and no allocatable
 //   component (that would require deep copies).
 // It requires no descriptor for the value that is passed via its base address.
-void RTNAME(PushArrayConstructorSimpleScalar)(
+void RTDECL(PushArrayConstructorSimpleScalar)(
     ArrayConstructorVector &vector, void *from);
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/include/flang/Runtime/character.h b/flang/include/flang/Runtime/character.h
index 768de75b639c55..dd47686fe858fd 100644
--- a/flang/include/flang/Runtime/character.h
+++ b/flang/include/flang/Runtime/character.h
@@ -20,14 +20,16 @@ namespace Fortran::runtime {
 class Descriptor;
 
 template <typename CHAR>
-int CharacterScalarCompare(
+RT_API_ATTRS int CharacterScalarCompare(
     const CHAR *x, const CHAR *y, std::size_t xChars, std::size_t yChars);
-extern template int CharacterScalarCompare<char>(
+extern template RT_API_ATTRS int CharacterScalarCompare<char>(
     const char *x, const char *y, std::size_t xChars, std::size_t yChars);
-extern template int CharacterScalarCompare<char16_t>(const char16_t *x,
-    const char16_t *y, std::size_t xChars, std::size_t yChars);
-extern template int CharacterScalarCompare<char32_t>(const char32_t *x,
-    const char32_t *y, std::size_t xChars, std::size_t yChars);
+extern template RT_API_ATTRS int CharacterScalarCompare<char16_t>(
+    const char16_t *x, const char16_t *y, std::size_t xChars,
+    std::size_t yChars);
+extern template RT_API_ATTRS int CharacterScalarCompare<char32_t>(
+    const char32_t *x, const char32_t *y, std::size_t xChars,
+    std::size_t yChars);
 
 extern "C" {
 
@@ -36,12 +38,12 @@ extern "C" {
 // initialized CHARACTER allocatable scalar or array descriptor -- use
 // AllocatableInitCharacter() to set one up.  Crashes when not
 // conforming.  Assumes independence of data.
-void RTNAME(CharacterConcatenate)(Descriptor &accumulator,
+void RTDECL(CharacterConcatenate)(Descriptor &accumulator,
     const Descriptor &from, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 // Convenience specialization for ASCII scalars concatenation.
-void RTNAME(CharacterConcatenateScalar1)(
+void RTDECL(CharacterConcatenateScalar1)(
     Descriptor &accumulator, const char *from, std::size_t chars);
 
 // CHARACTER comparisons.  The kinds must match.  Like std::memcmp(),
@@ -52,77 +54,77 @@ void RTNAME(CharacterConcatenateScalar1)(
 // N.B.: Calls to the restricted specific intrinsic functions LGE, LGT, LLE,
 // & LLT are converted into calls to these during lowering; they don't have
 // to be able to be passed as actual procedure arguments.
-int RTNAME(CharacterCompareScalar)(const Descriptor &, const Descriptor &);
-int RTNAME(CharacterCompareScalar1)(
+int RTDECL(CharacterCompareScalar)(const Descriptor &, const Descriptor &);
+int RTDECL(CharacterCompareScalar1)(
     const char *x, const char *y, std::size_t xChars, std::size_t yChars);
-int RTNAME(CharacterCompareScalar2)(const char16_t *x, const char16_t *y,
+int RTDECL(CharacterCompareScalar2)(const char16_t *x, const char16_t *y,
     std::size_t xChars, std::size_t yChars);
-int RTNAME(CharacterCompareScalar4)(const char32_t *x, const char32_t *y,
+int RTDECL(CharacterCompareScalar4)(const char32_t *x, const char32_t *y,
     std::size_t xChars, std::size_t yChars);
 
 // General CHARACTER comparison; the result is a LOGICAL(KIND=1) array that
 // is established and populated.
-void RTNAME(CharacterCompare)(
+void RTDECL(CharacterCompare)(
     Descriptor &result, const Descriptor &, const Descriptor &);
 
 // Special-case support for optimized ASCII scalar expressions.
 
 // Copies data from 'rhs' to the remaining space (lhsLength - offset)
 // in 'lhs', if any.  Returns the new offset.  Assumes independence.
-std::size_t RTNAME(CharacterAppend1)(char *lhs, std::size_t lhsBytes,
+std::size_t RTDECL(CharacterAppend1)(char *lhs, std::size_t lhsBytes,
     std::size_t offset, const char *rhs, std::size_t rhsBytes);
 
 // Appends any necessary spaces to a CHARACTER(KIND=1) scalar.
-void RTNAME(CharacterPad1)(char *lhs, std::size_t bytes, std::size_t offset);
+void RTDECL(CharacterPad1)(char *lhs, std::size_t bytes, std::size_t offset);
 
 // Intrinsic functions
 // The result descriptors below are all established by the runtime.
-void RTNAME(Adjustl)(Descriptor &result, const Descriptor &,
+void RTDECL(Adjustl)(Descriptor &result, const Descriptor &,
     const char *sourceFile = nullptr, int sourceLine = 0);
-void RTNAME(Adjustr)(Descriptor &result, const Descriptor &,
+void RTDECL(Adjustr)(Descriptor &result, const Descriptor &,
     const char *sourceFile = nullptr, int sourceLine = 0);
-std::size_t RTNAME(LenTrim1)(const char *, std::size_t);
-std::size_t RTNAME(LenTrim2)(const char16_t *, std::size_t);
-std::size_t RTNAME(LenTrim4)(const char32_t *, std::size_t);
-void RTNAME(LenTrim)(Descriptor &result, const Descriptor &, int kind,
+std::size_t RTDECL(LenTrim1)(const char *, std::size_t);
+std::size_t RTDECL(LenTrim2)(const char16_t *, std::size_t);
+std::size_t RTDECL(LenTrim4)(const char32_t *, std::size_t);
+void RTDECL(LenTrim)(Descriptor &result, const Descriptor &, int kind,
     const char *sourceFile = nullptr, int sourceLine = 0);
-void RTNAME(Repeat)(Descriptor &result, const Descriptor &string,
+void RTDECL(Repeat)(Descriptor &result, const Descriptor &string,
     std::int64_t ncopies, const char *sourceFile = nullptr, int sourceLine = 0);
-void RTNAME(Trim)(Descriptor &result, const Descriptor &string,
+void RTDECL(Trim)(Descriptor &result, const Descriptor &string,
     const char *sourceFile = nullptr, int sourceLine = 0);
 
-void RTNAME(CharacterMax)(Descriptor &accumulator, const Descriptor &x,
+void RTDECL(CharacterMax)(Descriptor &accumulator, const Descriptor &x,
     const char *sourceFile = nullptr, int sourceLine = 0);
-void RTNAME(CharacterMin)(Descriptor &accumulator, const Descriptor &x,
+void RTDECL(CharacterMin)(Descriptor &accumulator, const Descriptor &x,
     const char *sourceFile = nullptr, int sourceLine = 0);
 
-std::size_t RTNAME(Index1)(const char *, std::size_t, const char *substring,
+std::size_t RTDECL(Index1)(const char *, std::size_t, const char *substring,
     std::size_t, bool back = false);
-std::size_t RTNAME(Index2)(const char16_t *, std::size_t,
+std::size_t RTDECL(Index2)(const char16_t *, std::size_t,
     const char16_t *substring, std::size_t, bool back = false);
-std::size_t RTNAME(Index4)(const char32_t *, std::size_t,
+std::size_t RTDECL(Index4)(const char32_t *, std::size_t,
     const char32_t *substring, std::size_t, bool back = false);
-void RTNAME(Index)(Descriptor &result, const Descriptor &string,
+void RTDECL(Index)(Descriptor &result, const Descriptor &string,
     const Descriptor &substring, const Descriptor *back /*can be null*/,
     int kind, const char *sourceFile = nullptr, int sourceLine = 0);
 
-std::size_t RTNAME(Scan1)(
+std::size_t RTDECL(Scan1)(
     const char *, std::size_t, const char *set, std::size_t, bool back = false);
-std::size_t RTNAME(Scan2)(const char16_t *, std::size_t, const char16_t *set,
+std::size_t RTDECL(Scan2)(const char16_t *, std::size_t, const char16_t *set,
     std::size_t, bool back = false);
-std::size_t RTNAME(Scan4)(const char32_t *, std::size_t, const char32_t *set,
+std::size_t RTDECL(Scan4)(const char32_t *, std::size_t, const char32_t *set,
     std::size_t, bool back = false);
-void RTNAME(Scan)(Descriptor &result, const Descriptor &string,
+void RTDECL(Scan)(Descriptor &result, const Descriptor &string,
     const Descriptor &set, const Descriptor *back /*can be null*/, int kind,
     const char *sourceFile = nullptr, int sourceLine = 0);
 
-std::size_t RTNAME(Verify1)(
+std::size_t RTDECL(Verify1)(
     const char *, std::size_t, const char *set, std::size_t, bool back = false);
-std::size_t RTNAME(Verify2)(const char16_t *, std::size_t, const char16_t *set,
+std::size_t RTDECL(Verify2)(const char16_t *, std::size_t, const char16_t *set,
     std::size_t, bool back = false);
-std::size_t RTNAME(Verify4)(const char32_t *, std::size_t, const char32_t *set,
+std::size_t RTDECL(Verify4)(const char32_t *, std::size_t, const char32_t *set,
     std::size_t, bool back = false);
-void RTNAME(Verify)(Descriptor &result, const Descriptor &string,
+void RTDECL(Verify)(Descriptor &result, const Descriptor &string,
     const Descriptor &set, const Descriptor *back /*can be null*/, int kind,
     const char *sourceFile = nullptr, int sourceLine = 0);
 }
diff --git a/flang/include/flang/Runtime/descriptor.h b/flang/include/flang/Runtime/descriptor.h
index fa68d97769695a..e36b37c1a917eb 100644
--- a/flang/include/flang/Runtime/descriptor.h
+++ b/flang/include/flang/Runtime/descriptor.h
@@ -67,16 +67,16 @@ class Dimension {
   }
   // Do not use this API to cause the LB of an empty dimension
   // to be anything other than 1.  Use SetBounds() instead if you can.
-  Dimension &SetLowerBound(SubscriptValue lower) {
+  RT_API_ATTRS Dimension &SetLowerBound(SubscriptValue lower) {
     raw_.lower_bound = lower;
     return *this;
   }
-  Dimension &SetUpperBound(SubscriptValue upper) {
+  RT_API_ATTRS Dimension &SetUpperBound(SubscriptValue upper) {
     auto lower{raw_.lower_bound};
     raw_.extent = upper >= lower ? upper - lower + 1 : 0;
     return *this;
   }
-  Dimension &SetExtent(SubscriptValue extent) {
+  RT_API_ATTRS Dimension &SetExtent(SubscriptValue extent) {
     raw_.extent = extent;
     return *this;
   }
@@ -467,5 +467,6 @@ class alignas(Descriptor) StaticDescriptor {
 private:
   char storage_[byteSize]{};
 };
+
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_DESCRIPTOR_H_
diff --git a/flang/include/flang/Runtime/inquiry.h b/flang/include/flang/Runtime/inquiry.h
index 8d673637b30092..3fe670b0fae31e 100644
--- a/flang/include/flang/Runtime/inquiry.h
+++ b/flang/include/flang/Runtime/inquiry.h
@@ -21,13 +21,13 @@ class Descriptor;
 
 extern "C" {
 
-std::int64_t RTNAME(LboundDim)(const Descriptor &array, int dim,
+std::int64_t RTDECL(LboundDim)(const Descriptor &array, int dim,
     const char *sourceFile = nullptr, int line = 0);
-void RTNAME(Ubound)(Descriptor &result, const Descriptor &array, int kind,
+void RTDECL(Ubound)(Descriptor &result, const Descriptor &array, int kind,
     const char *sourceFile = nullptr, int line = 0);
-std::int64_t RTNAME(Size)(
+std::int64_t RTDECL(Size)(
     const Descriptor &array, const char *sourceFile = nullptr, int line = 0);
-std::int64_t RTNAME(SizeDim)(const Descriptor &array, int dim,
+std::int64_t RTDECL(SizeDim)(const Descriptor &array, int dim,
     const char *sourceFile = nullptr, int line = 0);
 
 } // extern "C"
diff --git a/flang/include/flang/Runtime/misc-intrinsic.h b/flang/include/flang/Runtime/misc-intrinsic.h
index d4c20539532d4b..73cc9e2023d979 100644
--- a/flang/include/flang/Runtime/misc-intrinsic.h
+++ b/flang/include/flang/Runtime/misc-intrinsic.h
@@ -19,9 +19,9 @@ namespace Fortran::runtime {
 class Descriptor;
 
 extern "C" {
-void RTNAME(Transfer)(Descriptor &result, const Descriptor &source,
+void RTDECL(Transfer)(Descriptor &result, const Descriptor &source,
     const Descriptor &mold, const char *sourceFile, int line);
-void RTNAME(TransferSize)(Descriptor &result, const Descriptor &source,
+void RTDECL(TransferSize)(Descriptor &result, const Descriptor &source,
     const Descriptor &mold, const char *sourceFile, int line,
     std::int64_t size);
 } // extern "C"
diff --git a/flang/include/flang/Runtime/pointer.h b/flang/include/flang/Runtime/pointer.h
index 52ab9482ed953f..6ceb70ebb676d0 100644
--- a/flang/include/flang/Runtime/pointer.h
+++ b/flang/include/flang/Runtime/pointer.h
@@ -21,45 +21,45 @@ extern "C" {
 // Data pointer initialization for NULLIFY(), "p=>NULL()`, & for ALLOCATE().
 
 // Initializes a pointer to a disassociated state for NULLIFY() or "p=>NULL()".
-void RTNAME(PointerNullifyIntrinsic)(
+void RTDECL(PointerNullifyIntrinsic)(
     Descriptor &, TypeCategory, int kind, int rank = 0, int corank = 0);
-void RTNAME(PointerNullifyCharacter)(Descriptor &, SubscriptValue length = 0,
+void RTDECL(PointerNullifyCharacter)(Descriptor &, SubscriptValue length = 0,
     int kind = 1, int rank = 0, int corank = 0);
-void RTNAME(PointerNullifyDerived)(
+void RTDECL(PointerNullifyDerived)(
     Descriptor &, const typeInfo::DerivedType &, int rank = 0, int corank = 0);
 
 // Explicitly sets the bounds of an initialized disassociated pointer.
 // The upper cobound is ignored for the last codimension.
-void RTNAME(PointerSetBounds)(
+void RTDECL(PointerSetBounds)(
     Descriptor &, int zeroBasedDim, SubscriptValue lower, SubscriptValue upper);
-void RTNAME(PointerSetCoBounds)(Descriptor &, int zeroBasedCoDim,
+void RTDECL(PointerSetCoBounds)(Descriptor &, int zeroBasedCoDim,
     SubscriptValue lower, SubscriptValue upper = 0);
 
 // Length type parameters are indexed in declaration order; i.e., 0 is the
 // first length type parameter in the deepest base type.  (Not for use
 // with CHARACTER; see above.)
-void RTNAME(PointerSetDerivedLength)(Descriptor &, int which, SubscriptValue);
+void RTDECL(PointerSetDerivedLength)(Descriptor &, int which, SubscriptValue);
 
 // For MOLD= allocation: acquires information from another descriptor
 // to initialize a null data pointer.
-void RTNAME(PointerApplyMold)(
+void RTDECL(PointerApplyMold)(
     Descriptor &, const Descriptor &mold, int rank = 0);
 
 // Data pointer association for "p=>TARGET"
 
 // Associates a scalar pointer with a simple scalar target.
-void RTNAME(PointerAssociateScalar)(Descriptor &, void *);
+void RTDECL(PointerAssociateScalar)(Descriptor &, void *);
 
 // Associates a pointer with a target of the same rank, possibly with new lower
 // bounds, which are passed in a vector whose length must equal the rank.
-void RTNAME(PointerAssociate)(Descriptor &, const Descriptor &target);
-void RTNAME(PointerAssociateLowerBounds)(
+void RTDECL(PointerAssociate)(Descriptor &, const Descriptor &target);
+void RTDECL(PointerAssociateLowerBounds)(
     Descriptor &, const Descriptor &target, const Descriptor &lowerBounds);
 
 // Associates a pointer with a target with bounds remapping.  The target must be
 // simply contiguous &/or of rank 1.  The bounds constitute a [2,newRank]
 // integer array whose columns are [lower bound, upper bound] on each dimension.
-void RTNAME(PointerAssociateRemapping)(Descriptor &, const Descriptor &target,
+void RTDECL(PointerAssociateRemapping)(Descriptor &, const Descriptor &target,
     const Descriptor &bounds, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
@@ -70,7 +70,7 @@ void RTNAME(PointerAssociateRemapping)(Descriptor &, const Descriptor &target,
 // a derived type or CHARACTER value, the explicit value has to match
 // the length type parameter's value.  This API checks that requirement.
 // Returns 0 for success, or the STAT= value on failure with hasStat==true.
-int RTNAME(PointerCheckLengthParameter)(Descriptor &,
+int RTDECL(PointerCheckLengthParameter)(Descriptor &,
     int which /* 0 for CHARACTER length */, SubscriptValue other,
     bool hasStat = false, const Descriptor *errMsg = nullptr,
     const char *sourceFile = nullptr, int sourceLine = 0);
@@ -83,10 +83,10 @@ int RTNAME(PointerCheckLengthParameter)(Descriptor &,
 // Successfully allocated memory is initialized if the pointer has a
 // derived type, and is always initialized by PointerAllocateSource().
 // Performs all necessary coarray synchronization and validation actions.
-int RTNAME(PointerAllocate)(Descriptor &, bool hasStat = false,
+int RTDECL(PointerAllocate)(Descriptor &, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
-int RTNAME(PointerAllocateSource)(Descriptor &, const Descriptor &source,
+int RTDECL(PointerAllocateSource)(Descriptor &, const Descriptor &source,
     bool hasStat = false, const Descriptor *errMsg = nullptr,
     const char *sourceFile = nullptr, int sourceLine = 0);
 
@@ -95,13 +95,13 @@ int RTNAME(PointerAllocateSource)(Descriptor &, const Descriptor &source,
 // Finalizes elements &/or components as needed. The pointer is left
 // in an initialized disassociated state suitable for reallocation
 // with the same bounds, cobounds, and length type parameters.
-int RTNAME(PointerDeallocate)(Descriptor &, bool hasStat = false,
+int RTDECL(PointerDeallocate)(Descriptor &, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 // Same as PointerDeallocate but also set the dynamic type as the declared type
 // as mentioned in 7.3.2.3 note 7.
-int RTNAME(PointerDeallocatePolymorphic)(Descriptor &,
+int RTDECL(PointerDeallocatePolymorphic)(Descriptor &,
     const typeInfo::DerivedType *, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
@@ -109,10 +109,10 @@ int RTNAME(PointerDeallocatePolymorphic)(Descriptor &,
 // Association inquiries for ASSOCIATED()
 
 // True when the pointer is not disassociated.
-bool RTNAME(PointerIsAssociated)(const Descriptor &);
+bool RTDECL(PointerIsAssociated)(const Descriptor &);
 
 // True when the pointer is associated with a specific target.
-bool RTNAME(PointerIsAssociatedWith)(
+bool RTDECL(PointerIsAssociatedWith)(
     const Descriptor &, const Descriptor *target);
 
 } // extern "C"
diff --git a/flang/include/flang/Runtime/ragged.h b/flang/include/flang/Runtime/ragged.h
index e4b5838212a845..f52a619c55b43e 100644
--- a/flang/include/flang/Runtime/ragged.h
+++ b/flang/include/flang/Runtime/ragged.h
@@ -29,11 +29,6 @@ struct RaggedArrayHeader {
   std::int64_t *extentPointer;
 };
 
-RaggedArrayHeader *RaggedArrayAllocate(
-    RaggedArrayHeader *, bool, std::int64_t, std::int64_t, std::int64_t *);
-
-void RaggedArrayDeallocate(RaggedArrayHeader *);
-
 extern "C" {
 
 // For more on ragged arrays see https://en.wikipedia.org/wiki/Jagged_array. The
@@ -53,12 +48,12 @@ extern "C" {
 // non-negative rank indicates the length of the extentVector, which is a list
 // of non-negative extents. elementSize is the size of a data element in the
 // rectangular space defined by the extentVector.
-void *RTNAME(RaggedArrayAllocate)(void *header, bool isHeader,
+void *RTDECL(RaggedArrayAllocate)(void *header, bool isHeader,
     std::int64_t rank, std::int64_t elementSize, std::int64_t *extentVector);
 
 // Runtime helper for deallocation of ragged array buffers. The root header of
 // the ragged array structure is passed to deallocate the entire ragged array.
-void RTNAME(RaggedArrayDeallocate)(void *raggedArrayHeader);
+void RTDECL(RaggedArrayDeallocate)(void *raggedArrayHeader);
 
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index 5df94491b06fb8..d6df15b7f6e078 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -90,16 +90,16 @@ set(sources
   array-constructor.cpp
   assign.cpp
   buffer.cpp
+  character.cpp
   command.cpp
   complex-powi.cpp
   complex-reduction.c
-  copy.cpp
-  character.cpp
   connection.cpp
-  derived.cpp
+  copy.cpp
   derived-api.cpp
-  descriptor.cpp
+  derived.cpp
   descriptor-io.cpp
+  descriptor.cpp
   dot-product.cpp
   edit-input.cpp
   edit-output.cpp
@@ -112,10 +112,10 @@ set(sources
   format.cpp
   inquiry.cpp
   internal-unit.cpp
-  iostat.cpp
   io-api.cpp
   io-error.cpp
   io-stmt.cpp
+  iostat.cpp
   main.cpp
   matmul-transpose.cpp
   matmul.cpp
@@ -124,11 +124,11 @@ set(sources
   namelist.cpp
   non-tbp-dio.cpp
   numeric.cpp
+  pointer.cpp
+  product.cpp
   ragged.cpp
   random.cpp
   reduction.cpp
-  pointer.cpp
-  product.cpp
   stat.cpp
   stop.cpp
   sum.cpp
@@ -140,8 +140,8 @@ set(sources
   transformational.cpp
   type-code.cpp
   type-info.cpp
-  unit.cpp
   unit-map.cpp
+  unit.cpp
   utf.cpp
 )
 
@@ -153,13 +153,25 @@ option(FLANG_EXPERIMENTAL_CUDA_RUNTIME
 set(supported_files
   ISO_Fortran_binding.cpp
   allocatable.cpp
+  array-constructor.cpp
   assign.cpp
-  derived.cpp
+  character.cpp
+  copy.cpp
   derived-api.cpp
+  derived.cpp
   descriptor.cpp
+  dot-product.cpp
+  extrema.cpp
+  findloc.cpp
+  inquiry.cpp
   matmul-transpose.cpp
   matmul.cpp
+  memory.cpp
+  misc-intrinsic.cpp
   numeric.cpp
+  pointer.cpp
+  product.cpp
+  ragged.cpp
   stat.cpp
   sum.cpp
   support.cpp
diff --git a/flang/runtime/allocatable.cpp b/flang/runtime/allocatable.cpp
index 143518d8e99e13..e69795e6f824ba 100644
--- a/flang/runtime/allocatable.cpp
+++ b/flang/runtime/allocatable.cpp
@@ -18,6 +18,7 @@
 
 namespace Fortran::runtime {
 extern "C" {
+RT_EXT_API_GROUP_BEGIN
 
 void RTDEF(AllocatableInitIntrinsic)(Descriptor &descriptor,
     TypeCategory category, int kind, int rank, int corank) {
@@ -217,5 +218,7 @@ void RTDEF(AllocatableDeallocateNoFinal)(
 }
 
 // TODO: AllocatableCheckLengthParameter
+
+RT_EXT_API_GROUP_END
 }
 } // namespace Fortran::runtime
diff --git a/flang/runtime/array-constructor.cpp b/flang/runtime/array-constructor.cpp
index 1be302eaaf1ae5..3905c70460987f 100644
--- a/flang/runtime/array-constructor.cpp
+++ b/flang/runtime/array-constructor.cpp
@@ -9,6 +9,7 @@
 #include "flang/Runtime/array-constructor.h"
 #include "derived.h"
 #include "terminator.h"
+#include "tools.h"
 #include "type-info.h"
 #include "flang/Runtime/allocatable.h"
 #include "flang/Runtime/assign.h"
@@ -23,7 +24,7 @@ namespace Fortran::runtime {
 //  REAL(8), INTEGER(8), COMPLEX(4), ...   -> 16 elements.
 //  REAL(16), INTEGER(16), COMPLEX(8), ... -> 8 elements.
 //  Bigger types -> 4 elements.
-static SubscriptValue initialAllocationSize(
+static RT_API_ATTRS SubscriptValue initialAllocationSize(
     SubscriptValue initialNumberOfElements, SubscriptValue elementBytes) {
   // Try to guess an optimal initial allocation size in number of elements to
   // avoid doing too many reallocation.
@@ -36,9 +37,9 @@ static SubscriptValue initialAllocationSize(
   return std::max(numberOfElements, elementsForMinBytes);
 }
 
-static void AllocateOrReallocateVectorIfNeeded(ArrayConstructorVector &vector,
-    Terminator &terminator, SubscriptValue previousToElements,
-    SubscriptValue fromElements) {
+static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
+    ArrayConstructorVector &vector, Terminator &terminator,
+    SubscriptValue previousToElements, SubscriptValue fromElements) {
   Descriptor &to{vector.to};
   if (to.IsAllocatable() && !to.IsAllocated()) {
     // The descriptor bounds may already be set here if the array constructor
@@ -73,7 +74,7 @@ static void AllocateOrReallocateVectorIfNeeded(ArrayConstructorVector &vector,
       // realloc is undefined with zero new size and ElementBytes() may be null
       // if the character length is null, or if "from" is a zero sized array.
       if (newByteSize > 0) {
-        void *p{std::realloc(to.raw().base_addr, newByteSize)};
+        void *p{Fortran::runtime::realloc(to.raw().base_addr, newByteSize)};
         RUNTIME_CHECK(terminator, p);
         to.set_base_addr(p);
       }
@@ -88,7 +89,9 @@ static void AllocateOrReallocateVectorIfNeeded(ArrayConstructorVector &vector,
 }
 
 extern "C" {
-void RTNAME(InitArrayConstructorVector)(ArrayConstructorVector &vector,
+RT_EXT_API_GROUP_BEGIN
+
+void RTDEF(InitArrayConstructorVector)(ArrayConstructorVector &vector,
     Descriptor &to, bool useValueLengthParameters, int vectorClassSize,
     const char *sourceFile, int sourceLine) {
   Terminator terminator{vector.sourceFile, vector.sourceLine};
@@ -102,7 +105,7 @@ void RTNAME(InitArrayConstructorVector)(ArrayConstructorVector &vector,
       actualAllocationSize, sourceFile, sourceLine, useValueLengthParameters};
 }
 
-void RTNAME(PushArrayConstructorValue)(
+void RTDEF(PushArrayConstructorValue)(
     ArrayConstructorVector &vector, const Descriptor &from) {
   Terminator terminator{vector.sourceFile, vector.sourceLine};
   Descriptor &to{vector.to};
@@ -166,7 +169,7 @@ void RTNAME(PushArrayConstructorValue)(
   vector.nextValuePosition += fromElements;
 }
 
-void RTNAME(PushArrayConstructorSimpleScalar)(
+void RTDEF(PushArrayConstructorSimpleScalar)(
     ArrayConstructorVector &vector, void *from) {
   Terminator terminator{vector.sourceFile, vector.sourceLine};
   Descriptor &to{vector.to};
@@ -176,5 +179,7 @@ void RTNAME(PushArrayConstructorSimpleScalar)(
   std::memcpy(to.Element<char>(subscript), from, to.ElementBytes());
   ++vector.nextValuePosition;
 }
+
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/character.cpp b/flang/runtime/character.cpp
index 084aa0c9c8b64e..5049247397eb3c 100644
--- a/flang/runtime/character.cpp
+++ b/flang/runtime/character.cpp
@@ -20,7 +20,8 @@
 namespace Fortran::runtime {
 
 template <typename CHAR>
-inline int CompareToBlankPadding(const CHAR *x, std::size_t chars) {
+inline RT_API_ATTRS int CompareToBlankPadding(
+    const CHAR *x, std::size_t chars) {
   using UNSIGNED_CHAR = std::make_unsigned_t<CHAR>;
   const auto blank{static_cast<UNSIGNED_CHAR>(' ')};
   for (; chars-- > 0; ++x) {
@@ -35,13 +36,15 @@ inline int CompareToBlankPadding(const CHAR *x, std::size_t chars) {
   return 0;
 }
 
+RT_OFFLOAD_API_GROUP_BEGIN
+
 template <typename CHAR>
-int CharacterScalarCompare(
+RT_API_ATTRS int CharacterScalarCompare(
     const CHAR *x, const CHAR *y, std::size_t xChars, std::size_t yChars) {
   auto minChars{std::min(xChars, yChars)};
   if constexpr (sizeof(CHAR) == 1) {
     // don't use for kind=2 or =4, that would fail on little-endian machines
-    int cmp{std::memcmp(x, y, minChars)};
+    int cmp{Fortran::runtime::memcmp(x, y, minChars)};
     if (cmp < 0) {
       return -1;
     }
@@ -69,20 +72,22 @@ int CharacterScalarCompare(
   return -CompareToBlankPadding(y, yChars - minChars);
 }
 
-template int CharacterScalarCompare<char>(
+template RT_API_ATTRS int CharacterScalarCompare<char>(
     const char *x, const char *y, std::size_t xChars, std::size_t yChars);
-template int CharacterScalarCompare<char16_t>(const char16_t *x,
+template RT_API_ATTRS int CharacterScalarCompare<char16_t>(const char16_t *x,
     const char16_t *y, std::size_t xChars, std::size_t yChars);
-template int CharacterScalarCompare<char32_t>(const char32_t *x,
+template RT_API_ATTRS int CharacterScalarCompare<char32_t>(const char32_t *x,
     const char32_t *y, std::size_t xChars, std::size_t yChars);
 
+RT_OFFLOAD_API_GROUP_END
+
 // Shift count to use when converting between character lengths
 // and byte counts.
 template <typename CHAR>
 constexpr int shift{common::TrailingZeroBitCount(sizeof(CHAR))};
 
 template <typename CHAR>
-static void Compare(Descriptor &result, const Descriptor &x,
+static RT_API_ATTRS void Compare(Descriptor &result, const Descriptor &x,
     const Descriptor &y, const Terminator &terminator) {
   RUNTIME_CHECK(
       terminator, x.rank() == y.rank() || x.rank() == 0 || y.rank() == 0);
@@ -125,7 +130,7 @@ static void Compare(Descriptor &result, const Descriptor &x,
 }
 
 template <typename CHAR, bool ADJUSTR>
-static void Adjust(CHAR *to, const CHAR *from, std::size_t chars) {
+static RT_API_ATTRS void Adjust(CHAR *to, const CHAR *from, std::size_t chars) {
   if constexpr (ADJUSTR) {
     std::size_t j{chars}, k{chars};
     for (; k > 0 && from[k - 1] == ' '; --k) {
@@ -150,8 +155,8 @@ static void Adjust(CHAR *to, const CHAR *from, std::size_t chars) {
 }
 
 template <typename CHAR, bool ADJUSTR>
-static void AdjustLRHelper(Descriptor &result, const Descriptor &string,
-    const Terminator &terminator) {
+static RT_API_ATTRS void AdjustLRHelper(Descriptor &result,
+    const Descriptor &string, const Terminator &terminator) {
   int rank{string.rank()};
   SubscriptValue ub[maxRank], stringAt[maxRank];
   SubscriptValue elements{1};
@@ -178,7 +183,7 @@ static void AdjustLRHelper(Descriptor &result, const Descriptor &string,
 }
 
 template <bool ADJUSTR>
-void AdjustLR(Descriptor &result, const Descriptor &string,
+RT_API_ATTRS void AdjustLR(Descriptor &result, const Descriptor &string,
     const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   switch (string.raw().type) {
@@ -198,7 +203,7 @@ void AdjustLR(Descriptor &result, const Descriptor &string,
 }
 
 template <typename CHAR>
-inline std::size_t LenTrim(const CHAR *x, std::size_t chars) {
+inline RT_API_ATTRS std::size_t LenTrim(const CHAR *x, std::size_t chars) {
   while (chars > 0 && x[chars - 1] == ' ') {
     --chars;
   }
@@ -206,7 +211,7 @@ inline std::size_t LenTrim(const CHAR *x, std::size_t chars) {
 }
 
 template <typename INT, typename CHAR>
-static void LenTrim(Descriptor &result, const Descriptor &string,
+static RT_API_ATTRS void LenTrim(Descriptor &result, const Descriptor &string,
     const Terminator &terminator) {
   int rank{string.rank()};
   SubscriptValue ub[maxRank], stringAt[maxRank];
@@ -233,8 +238,8 @@ static void LenTrim(Descriptor &result, const Descriptor &string,
 }
 
 template <typename CHAR>
-static void LenTrimKind(Descriptor &result, const Descriptor &string, int kind,
-    const Terminator &terminator) {
+static RT_API_ATTRS void LenTrimKind(Descriptor &result,
+    const Descriptor &string, int kind, const Terminator &terminator) {
   switch (kind) {
   case 1:
     LenTrim<CppTypeFor<TypeCategory::Integer, 1>, CHAR>(
@@ -264,8 +269,8 @@ static void LenTrimKind(Descriptor &result, const Descriptor &string, int kind,
 
 // INDEX implementation
 template <typename CHAR>
-inline std::size_t Index(const CHAR *x, std::size_t xLen, const CHAR *want,
-    std::size_t wantLen, bool back) {
+inline RT_API_ATTRS std::size_t Index(const CHAR *x, std::size_t xLen,
+    const CHAR *want, std::size_t wantLen, bool back) {
   if (xLen < wantLen) {
     return 0;
   }
@@ -330,8 +335,8 @@ inline std::size_t Index(const CHAR *x, std::size_t xLen, const CHAR *want,
 enum class CharFunc { Index, Scan, Verify };
 
 template <typename CHAR, CharFunc FUNC>
-inline std::size_t ScanVerify(const CHAR *x, std::size_t xLen, const CHAR *set,
-    std::size_t setLen, bool back) {
+inline RT_API_ATTRS std::size_t ScanVerify(const CHAR *x, std::size_t xLen,
+    const CHAR *set, std::size_t setLen, bool back) {
   std::size_t at{back ? xLen : 1};
   int increment{back ? -1 : 1};
   for (; xLen-- > 0; at += increment) {
@@ -353,8 +358,8 @@ inline std::size_t ScanVerify(const CHAR *x, std::size_t xLen, const CHAR *set,
 
 // Specialization for one-byte characters
 template <bool IS_VERIFY = false>
-inline std::size_t ScanVerify(const char *x, std::size_t xLen, const char *set,
-    std::size_t setLen, bool back) {
+inline RT_API_ATTRS std::size_t ScanVerify(const char *x, std::size_t xLen,
+    const char *set, std::size_t setLen, bool back) {
   std::size_t at{back ? xLen : 1};
   int increment{back ? -1 : 1};
   if (xLen > 0) {
@@ -376,8 +381,8 @@ inline std::size_t ScanVerify(const char *x, std::size_t xLen, const char *set,
 }
 
 template <typename INT, typename CHAR, CharFunc FUNC>
-static void GeneralCharFunc(Descriptor &result, const Descriptor &string,
-    const Descriptor &arg, const Descriptor *back,
+static RT_API_ATTRS void GeneralCharFunc(Descriptor &result,
+    const Descriptor &string, const Descriptor &arg, const Descriptor *back,
     const Terminator &terminator) {
   int rank{string.rank() ? string.rank()
           : arg.rank()   ? arg.rank()
@@ -434,9 +439,9 @@ static void GeneralCharFunc(Descriptor &result, const Descriptor &string,
 }
 
 template <typename CHAR, CharFunc FUNC>
-static void GeneralCharFuncKind(Descriptor &result, const Descriptor &string,
-    const Descriptor &arg, const Descriptor *back, int kind,
-    const Terminator &terminator) {
+static RT_API_ATTRS void GeneralCharFuncKind(Descriptor &result,
+    const Descriptor &string, const Descriptor &arg, const Descriptor *back,
+    int kind, const Terminator &terminator) {
   switch (kind) {
   case 1:
     GeneralCharFunc<CppTypeFor<TypeCategory::Integer, 1>, CHAR, FUNC>(
@@ -466,8 +471,8 @@ static void GeneralCharFuncKind(Descriptor &result, const Descriptor &string,
 }
 
 template <typename CHAR, bool ISMIN>
-static void MaxMinHelper(Descriptor &accumulator, const Descriptor &x,
-    const Terminator &terminator) {
+static RT_API_ATTRS void MaxMinHelper(Descriptor &accumulator,
+    const Descriptor &x, const Terminator &terminator) {
   RUNTIME_CHECK(terminator,
       accumulator.rank() == 0 || x.rank() == 0 ||
           accumulator.rank() == x.rank());
@@ -525,7 +530,7 @@ static void MaxMinHelper(Descriptor &accumulator, const Descriptor &x,
 }
 
 template <bool ISMIN>
-static void MaxMin(Descriptor &accumulator, const Descriptor &x,
+static RT_API_ATTRS void MaxMin(Descriptor &accumulator, const Descriptor &x,
     const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   RUNTIME_CHECK(terminator, accumulator.raw().type == x.raw().type);
@@ -546,8 +551,9 @@ static void MaxMin(Descriptor &accumulator, const Descriptor &x,
 }
 
 extern "C" {
+RT_EXT_API_GROUP_BEGIN
 
-void RTNAME(CharacterConcatenate)(Descriptor &accumulator,
+void RTDEF(CharacterConcatenate)(Descriptor &accumulator,
     const Descriptor &from, const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   RUNTIME_CHECK(terminator,
@@ -596,7 +602,7 @@ void RTNAME(CharacterConcatenate)(Descriptor &accumulator,
   FreeMemory(old);
 }
 
-void RTNAME(CharacterConcatenateScalar1)(
+void RTDEF(CharacterConcatenateScalar1)(
     Descriptor &accumulator, const char *from, std::size_t chars) {
   Terminator terminator{__FILE__, __LINE__};
   RUNTIME_CHECK(terminator, accumulator.rank() == 0);
@@ -609,7 +615,7 @@ void RTNAME(CharacterConcatenateScalar1)(
   FreeMemory(old);
 }
 
-int RTNAME(CharacterCompareScalar)(const Descriptor &x, const Descriptor &y) {
+int RTDEF(CharacterCompareScalar)(const Descriptor &x, const Descriptor &y) {
   Terminator terminator{__FILE__, __LINE__};
   RUNTIME_CHECK(terminator, x.rank() == 0);
   RUNTIME_CHECK(terminator, y.rank() == 0);
@@ -633,22 +639,22 @@ int RTNAME(CharacterCompareScalar)(const Descriptor &x, const Descriptor &y) {
   return 0;
 }
 
-int RTNAME(CharacterCompareScalar1)(
+int RTDEF(CharacterCompareScalar1)(
     const char *x, const char *y, std::size_t xChars, std::size_t yChars) {
   return CharacterScalarCompare(x, y, xChars, yChars);
 }
 
-int RTNAME(CharacterCompareScalar2)(const char16_t *x, const char16_t *y,
+int RTDEF(CharacterCompareScalar2)(const char16_t *x, const char16_t *y,
     std::size_t xChars, std::size_t yChars) {
   return CharacterScalarCompare(x, y, xChars, yChars);
 }
 
-int RTNAME(CharacterCompareScalar4)(const char32_t *x, const char32_t *y,
+int RTDEF(CharacterCompareScalar4)(const char32_t *x, const char32_t *y,
     std::size_t xChars, std::size_t yChars) {
   return CharacterScalarCompare(x, y, xChars, yChars);
 }
 
-void RTNAME(CharacterCompare)(
+void RTDEF(CharacterCompare)(
     Descriptor &result, const Descriptor &x, const Descriptor &y) {
   Terminator terminator{__FILE__, __LINE__};
   RUNTIME_CHECK(terminator, x.raw().type == y.raw().type);
@@ -668,7 +674,7 @@ void RTNAME(CharacterCompare)(
   }
 }
 
-std::size_t RTNAME(CharacterAppend1)(char *lhs, std::size_t lhsBytes,
+std::size_t RTDEF(CharacterAppend1)(char *lhs, std::size_t lhsBytes,
     std::size_t offset, const char *rhs, std::size_t rhsBytes) {
   if (auto n{std::min(lhsBytes - offset, rhsBytes)}) {
     std::memcpy(lhs + offset, rhs, n);
@@ -677,7 +683,7 @@ std::size_t RTNAME(CharacterAppend1)(char *lhs, std::size_t lhsBytes,
   return offset;
 }
 
-void RTNAME(CharacterPad1)(char *lhs, std::size_t bytes, std::size_t offset) {
+void RTDEF(CharacterPad1)(char *lhs, std::size_t bytes, std::size_t offset) {
   if (bytes > offset) {
     std::memset(lhs + offset, ' ', bytes - offset);
   }
@@ -685,30 +691,30 @@ void RTNAME(CharacterPad1)(char *lhs, std::size_t bytes, std::size_t offset) {
 
 // Intrinsic function entry points
 
-void RTNAME(Adjustl)(Descriptor &result, const Descriptor &string,
+void RTDEF(Adjustl)(Descriptor &result, const Descriptor &string,
     const char *sourceFile, int sourceLine) {
   AdjustLR<false>(result, string, sourceFile, sourceLine);
 }
 
-void RTNAME(Adjustr)(Descriptor &result, const Descriptor &string,
+void RTDEF(Adjustr)(Descriptor &result, const Descriptor &string,
     const char *sourceFile, int sourceLine) {
   AdjustLR<true>(result, string, sourceFile, sourceLine);
 }
 
-std::size_t RTNAME(Index1)(const char *x, std::size_t xLen, const char *set,
+std::size_t RTDEF(Index1)(const char *x, std::size_t xLen, const char *set,
     std::size_t setLen, bool back) {
   return Index<char>(x, xLen, set, setLen, back);
 }
-std::size_t RTNAME(Index2)(const char16_t *x, std::size_t xLen,
+std::size_t RTDEF(Index2)(const char16_t *x, std::size_t xLen,
     const char16_t *set, std::size_t setLen, bool back) {
   return Index<char16_t>(x, xLen, set, setLen, back);
 }
-std::size_t RTNAME(Index4)(const char32_t *x, std::size_t xLen,
+std::size_t RTDEF(Index4)(const char32_t *x, std::size_t xLen,
     const char32_t *set, std::size_t setLen, bool back) {
   return Index<char32_t>(x, xLen, set, setLen, back);
 }
 
-void RTNAME(Index)(Descriptor &result, const Descriptor &string,
+void RTDEF(Index)(Descriptor &result, const Descriptor &string,
     const Descriptor &substring, const Descriptor *back, int kind,
     const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
@@ -731,17 +737,17 @@ void RTNAME(Index)(Descriptor &result, const Descriptor &string,
   }
 }
 
-std::size_t RTNAME(LenTrim1)(const char *x, std::size_t chars) {
+std::size_t RTDEF(LenTrim1)(const char *x, std::size_t chars) {
   return LenTrim(x, chars);
 }
-std::size_t RTNAME(LenTrim2)(const char16_t *x, std::size_t chars) {
+std::size_t RTDEF(LenTrim2)(const char16_t *x, std::size_t chars) {
   return LenTrim(x, chars);
 }
-std::size_t RTNAME(LenTrim4)(const char32_t *x, std::size_t chars) {
+std::size_t RTDEF(LenTrim4)(const char32_t *x, std::size_t chars) {
   return LenTrim(x, chars);
 }
 
-void RTNAME(LenTrim)(Descriptor &result, const Descriptor &string, int kind,
+void RTDEF(LenTrim)(Descriptor &result, const Descriptor &string, int kind,
     const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   switch (string.raw().type) {
@@ -760,20 +766,20 @@ void RTNAME(LenTrim)(Descriptor &result, const Descriptor &string, int kind,
   }
 }
 
-std::size_t RTNAME(Scan1)(const char *x, std::size_t xLen, const char *set,
+std::size_t RTDEF(Scan1)(const char *x, std::size_t xLen, const char *set,
     std::size_t setLen, bool back) {
   return ScanVerify<char, CharFunc::Scan>(x, xLen, set, setLen, back);
 }
-std::size_t RTNAME(Scan2)(const char16_t *x, std::size_t xLen,
+std::size_t RTDEF(Scan2)(const char16_t *x, std::size_t xLen,
     const char16_t *set, std::size_t setLen, bool back) {
   return ScanVerify<char16_t, CharFunc::Scan>(x, xLen, set, setLen, back);
 }
-std::size_t RTNAME(Scan4)(const char32_t *x, std::size_t xLen,
+std::size_t RTDEF(Scan4)(const char32_t *x, std::size_t xLen,
     const char32_t *set, std::size_t setLen, bool back) {
   return ScanVerify<char32_t, CharFunc::Scan>(x, xLen, set, setLen, back);
 }
 
-void RTNAME(Scan)(Descriptor &result, const Descriptor &string,
+void RTDEF(Scan)(Descriptor &result, const Descriptor &string,
     const Descriptor &set, const Descriptor *back, int kind,
     const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
@@ -796,7 +802,7 @@ void RTNAME(Scan)(Descriptor &result, const Descriptor &string,
   }
 }
 
-void RTNAME(Repeat)(Descriptor &result, const Descriptor &string,
+void RTDEF(Repeat)(Descriptor &result, const Descriptor &string,
     std::int64_t ncopies, const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   if (ncopies < 0) {
@@ -815,7 +821,7 @@ void RTNAME(Repeat)(Descriptor &result, const Descriptor &string,
   }
 }
 
-void RTNAME(Trim)(Descriptor &result, const Descriptor &string,
+void RTDEF(Trim)(Descriptor &result, const Descriptor &string,
     const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   std::size_t resultBytes{0};
@@ -844,20 +850,20 @@ void RTNAME(Trim)(Descriptor &result, const Descriptor &string,
   std::memcpy(result.OffsetElement(), string.OffsetElement(), resultBytes);
 }
 
-std::size_t RTNAME(Verify1)(const char *x, std::size_t xLen, const char *set,
+std::size_t RTDEF(Verify1)(const char *x, std::size_t xLen, const char *set,
     std::size_t setLen, bool back) {
   return ScanVerify<char, CharFunc::Verify>(x, xLen, set, setLen, back);
 }
-std::size_t RTNAME(Verify2)(const char16_t *x, std::size_t xLen,
+std::size_t RTDEF(Verify2)(const char16_t *x, std::size_t xLen,
     const char16_t *set, std::size_t setLen, bool back) {
   return ScanVerify<char16_t, CharFunc::Verify>(x, xLen, set, setLen, back);
 }
-std::size_t RTNAME(Verify4)(const char32_t *x, std::size_t xLen,
+std::size_t RTDEF(Verify4)(const char32_t *x, std::size_t xLen,
     const char32_t *set, std::size_t setLen, bool back) {
   return ScanVerify<char32_t, CharFunc::Verify>(x, xLen, set, setLen, back);
 }
 
-void RTNAME(Verify)(Descriptor &result, const Descriptor &string,
+void RTDEF(Verify)(Descriptor &result, const Descriptor &string,
     const Descriptor &set, const Descriptor *back, int kind,
     const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
@@ -880,14 +886,16 @@ void RTNAME(Verify)(Descriptor &result, const Descriptor &string,
   }
 }
 
-void RTNAME(CharacterMax)(Descriptor &accumulator, const Descriptor &x,
+void RTDEF(CharacterMax)(Descriptor &accumulator, const Descriptor &x,
     const char *sourceFile, int sourceLine) {
   MaxMin<false>(accumulator, x, sourceFile, sourceLine);
 }
 
-void RTNAME(CharacterMin)(Descriptor &accumulator, const Descriptor &x,
+void RTDEF(CharacterMin)(Descriptor &accumulator, const Descriptor &x,
     const char *sourceFile, int sourceLine) {
   MaxMin<true>(accumulator, x, sourceFile, sourceLine);
 }
+
+RT_EXT_API_GROUP_END
 }
 } // namespace Fortran::runtime
diff --git a/flang/runtime/copy.cpp b/flang/runtime/copy.cpp
index 71ef2c2f75665b..9e62d1e24a4731 100644
--- a/flang/runtime/copy.cpp
+++ b/flang/runtime/copy.cpp
@@ -14,8 +14,9 @@
 #include <cstring>
 
 namespace Fortran::runtime {
+RT_OFFLOAD_API_GROUP_BEGIN
 
-void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
+RT_API_ATTRS void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
     const Descriptor &from, const SubscriptValue fromAt[],
     Terminator &terminator) {
   char *toPtr{to.Element<char>(toAt)};
@@ -48,7 +49,7 @@ void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
   }
 }
 
-void CopyArray(
+RT_API_ATTRS void CopyArray(
     const Descriptor &to, const Descriptor &from, Terminator &terminator) {
   std::size_t elements{to.Elements()};
   RUNTIME_CHECK(terminator, elements == from.Elements());
@@ -61,4 +62,6 @@ void CopyArray(
     from.IncrementSubscripts(fromAt);
   }
 }
+
+RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime
diff --git a/flang/runtime/derived-api.cpp b/flang/runtime/derived-api.cpp
index 0b2df206938916..321f50a1edfcfe 100644
--- a/flang/runtime/derived-api.cpp
+++ b/flang/runtime/derived-api.cpp
@@ -17,6 +17,7 @@
 namespace Fortran::runtime {
 
 extern "C" {
+RT_EXT_API_GROUP_BEGIN
 
 void RTDEF(Initialize)(
     const Descriptor &descriptor, const char *sourceFile, int sourceLine) {
@@ -166,5 +167,6 @@ void RTDEF(DestroyWithoutFinalization)(const Descriptor &descriptor) {
   }
 }
 
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/dot-product.cpp b/flang/runtime/dot-product.cpp
index 58382863a50067..5e6dd541f6b1a1 100644
--- a/flang/runtime/dot-product.cpp
+++ b/flang/runtime/dot-product.cpp
@@ -21,14 +21,20 @@ namespace Fortran::runtime {
 // Beware: DOT_PRODUCT of COMPLEX data uses the complex conjugate of the first
 // argument; MATMUL does not.
 
+// Suppress the warnings about calling __host__-only std::complex operators,
+// defined in C++ STD header files, from __device__ code.
+RT_DIAG_PUSH
+RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+
 // General accumulator for any type and stride; this is not used for
 // contiguous numeric vectors.
 template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
 class Accumulator {
 public:
   using Result = AccumulationType<RCAT, RKIND>;
-  Accumulator(const Descriptor &x, const Descriptor &y) : x_{x}, y_{y} {}
-  void AccumulateIndexed(SubscriptValue xAt, SubscriptValue yAt) {
+  RT_API_ATTRS Accumulator(const Descriptor &x, const Descriptor &y)
+      : x_{x}, y_{y} {}
+  RT_API_ATTRS void AccumulateIndexed(SubscriptValue xAt, SubscriptValue yAt) {
     if constexpr (RCAT == TypeCategory::Logical) {
       sum_ = sum_ ||
           (IsLogicalElementTrue(x_, &xAt) && IsLogicalElementTrue(y_, &yAt));
@@ -43,7 +49,7 @@ class Accumulator {
       }
     }
   }
-  Result GetResult() const { return sum_; }
+  RT_API_ATTRS Result GetResult() const { return sum_; }
 
 private:
   const Descriptor &x_, &y_;
@@ -51,7 +57,7 @@ class Accumulator {
 };
 
 template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
-static inline CppTypeFor<RCAT, RKIND> DoDotProduct(
+static inline RT_API_ATTRS CppTypeFor<RCAT, RKIND> DoDotProduct(
     const Descriptor &x, const Descriptor &y, Terminator &terminator) {
   using Result = CppTypeFor<RCAT, RKIND>;
   RUNTIME_CHECK(terminator, x.rank() == 1 && y.rank() == 1);
@@ -83,8 +89,11 @@ static inline CppTypeFor<RCAT, RKIND> DoDotProduct(
       AccumType accum{};
       if constexpr (RCAT == TypeCategory::Complex) {
         for (SubscriptValue j{0}; j < n; ++j) {
-          accum += std::conj(static_cast<AccumType>(*xp++)) *
+          // std::conj() may instantiate its argument twice,
+          // so xp has to be incremented separately.
+          accum += std::conj(static_cast<AccumType>(*xp)) *
               static_cast<AccumType>(*yp++);
+          xp++;
         }
       } else {
         for (SubscriptValue j{0}; j < n; ++j) {
@@ -105,11 +114,13 @@ static inline CppTypeFor<RCAT, RKIND> DoDotProduct(
   return static_cast<Result>(accumulator.GetResult());
 }
 
+RT_DIAG_POP
+
 template <TypeCategory RCAT, int RKIND> struct DotProduct {
   using Result = CppTypeFor<RCAT, RKIND>;
   template <TypeCategory XCAT, int XKIND> struct DP1 {
     template <TypeCategory YCAT, int YKIND> struct DP2 {
-      Result operator()(const Descriptor &x, const Descriptor &y,
+      RT_API_ATTRS Result operator()(const Descriptor &x, const Descriptor &y,
           Terminator &terminator) const {
         if constexpr (constexpr auto resultType{
                           GetResultType(XCAT, XKIND, YCAT, YKIND)}) {
@@ -125,12 +136,12 @@ template <TypeCategory RCAT, int RKIND> struct DotProduct {
             static_cast<int>(YCAT), YKIND);
       }
     };
-    Result operator()(const Descriptor &x, const Descriptor &y,
+    RT_API_ATTRS Result operator()(const Descriptor &x, const Descriptor &y,
         Terminator &terminator, TypeCategory yCat, int yKind) const {
       return ApplyType<DP2, Result>(yCat, yKind, terminator, x, y, terminator);
     }
   };
-  Result operator()(const Descriptor &x, const Descriptor &y,
+  RT_API_ATTRS Result operator()(const Descriptor &x, const Descriptor &y,
       const char *source, int line) const {
     Terminator terminator{source, line};
     if (RCAT != TypeCategory::Logical && x.type() == y.type()) {
@@ -148,24 +159,26 @@ template <TypeCategory RCAT, int RKIND> struct DotProduct {
 };
 
 extern "C" {
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(DotProductInteger1)(
+RT_EXT_API_GROUP_BEGIN
+
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(DotProductInteger1)(
     const Descriptor &x, const Descriptor &y, const char *source, int line) {
   return DotProduct<TypeCategory::Integer, 1>{}(x, y, source, line);
 }
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(DotProductInteger2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(DotProductInteger2)(
     const Descriptor &x, const Descriptor &y, const char *source, int line) {
   return DotProduct<TypeCategory::Integer, 2>{}(x, y, source, line);
 }
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(DotProductInteger4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(DotProductInteger4)(
     const Descriptor &x, const Descriptor &y, const char *source, int line) {
   return DotProduct<TypeCategory::Integer, 4>{}(x, y, source, line);
 }
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(DotProductInteger8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(DotProductInteger8)(
     const Descriptor &x, const Descriptor &y, const char *source, int line) {
   return DotProduct<TypeCategory::Integer, 8>{}(x, y, source, line);
 }
 #ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(DotProductInteger16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(DotProductInteger16)(
     const Descriptor &x, const Descriptor &y, const char *source, int line) {
   return DotProduct<TypeCategory::Integer, 16>{}(x, y, source, line);
 }
@@ -173,53 +186,55 @@ CppTypeFor<TypeCategory::Integer, 16> RTNAME(DotProductInteger16)(
 
 // TODO: REAL/COMPLEX(2 & 3)
 // Intermediate results and operations are at least 64 bits
-CppTypeFor<TypeCategory::Real, 4> RTNAME(DotProductReal4)(
+CppTypeFor<TypeCategory::Real, 4> RTDEF(DotProductReal4)(
     const Descriptor &x, const Descriptor &y, const char *source, int line) {
   return DotProduct<TypeCategory::Real, 4>{}(x, y, source, line);
 }
-CppTypeFor<TypeCategory::Real, 8> RTNAME(DotProductReal8)(
+CppTypeFor<TypeCategory::Real, 8> RTDEF(DotProductReal8)(
     const Descriptor &x, const Descriptor &y, const char *source, int line) {
   return DotProduct<TypeCategory::Real, 8>{}(x, y, source, line);
 }
 #if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(DotProductReal10)(
+CppTypeFor<TypeCategory::Real, 10> RTDEF(DotProductReal10)(
     const Descriptor &x, const Descriptor &y, const char *source, int line) {
   return DotProduct<TypeCategory::Real, 10>{}(x, y, source, line);
 }
 #endif
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(DotProductReal16)(
+CppTypeFor<TypeCategory::Real, 16> RTDEF(DotProductReal16)(
     const Descriptor &x, const Descriptor &y, const char *source, int line) {
   return DotProduct<TypeCategory::Real, 16>{}(x, y, source, line);
 }
 #endif
 
-void RTNAME(CppDotProductComplex4)(CppTypeFor<TypeCategory::Complex, 4> &result,
+void RTDEF(CppDotProductComplex4)(CppTypeFor<TypeCategory::Complex, 4> &result,
     const Descriptor &x, const Descriptor &y, const char *source, int line) {
   result = DotProduct<TypeCategory::Complex, 4>{}(x, y, source, line);
 }
-void RTNAME(CppDotProductComplex8)(CppTypeFor<TypeCategory::Complex, 8> &result,
+void RTDEF(CppDotProductComplex8)(CppTypeFor<TypeCategory::Complex, 8> &result,
     const Descriptor &x, const Descriptor &y, const char *source, int line) {
   result = DotProduct<TypeCategory::Complex, 8>{}(x, y, source, line);
 }
 #if LDBL_MANT_DIG == 64
-void RTNAME(CppDotProductComplex10)(
+void RTDEF(CppDotProductComplex10)(
     CppTypeFor<TypeCategory::Complex, 10> &result, const Descriptor &x,
     const Descriptor &y, const char *source, int line) {
   result = DotProduct<TypeCategory::Complex, 10>{}(x, y, source, line);
 }
 #endif
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-void RTNAME(CppDotProductComplex16)(
+void RTDEF(CppDotProductComplex16)(
     CppTypeFor<TypeCategory::Complex, 16> &result, const Descriptor &x,
     const Descriptor &y, const char *source, int line) {
   result = DotProduct<TypeCategory::Complex, 16>{}(x, y, source, line);
 }
 #endif
 
-bool RTNAME(DotProductLogical)(
+bool RTDEF(DotProductLogical)(
     const Descriptor &x, const Descriptor &y, const char *source, int line) {
   return DotProduct<TypeCategory::Logical, 1>{}(x, y, source, line);
 }
+
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/extrema.cpp b/flang/runtime/extrema.cpp
index 70b2c4d3d735a1..edb5d5f47a5acf 100644
--- a/flang/runtime/extrema.cpp
+++ b/flang/runtime/extrema.cpp
@@ -26,8 +26,8 @@ namespace Fortran::runtime {
 
 template <typename T, bool IS_MAX, bool BACK> struct NumericCompare {
   using Type = T;
-  explicit NumericCompare(std::size_t /*elemLen; ignored*/) {}
-  bool operator()(const T &value, const T &previous) const {
+  explicit RT_API_ATTRS NumericCompare(std::size_t /*elemLen; ignored*/) {}
+  RT_API_ATTRS bool operator()(const T &value, const T &previous) const {
     if (value == previous) {
       return BACK;
     } else if constexpr (IS_MAX) {
@@ -41,9 +41,9 @@ template <typename T, bool IS_MAX, bool BACK> struct NumericCompare {
 template <typename T, bool IS_MAX, bool BACK> class CharacterCompare {
 public:
   using Type = T;
-  explicit CharacterCompare(std::size_t elemLen)
+  explicit RT_API_ATTRS CharacterCompare(std::size_t elemLen)
       : chars_{elemLen / sizeof(T)} {}
-  bool operator()(const T &value, const T &previous) const {
+  RT_API_ATTRS bool operator()(const T &value, const T &previous) const {
     int cmp{CharacterScalarCompare<T>(&value, &previous, chars_, chars_)};
     if (cmp == 0) {
       return BACK;
@@ -61,19 +61,20 @@ template <typename T, bool IS_MAX, bool BACK> class CharacterCompare {
 template <typename COMPARE> class ExtremumLocAccumulator {
 public:
   using Type = typename COMPARE::Type;
-  ExtremumLocAccumulator(const Descriptor &array)
+  RT_API_ATTRS ExtremumLocAccumulator(const Descriptor &array)
       : array_{array}, argRank_{array.rank()}, compare_{array.ElementBytes()} {
     Reinitialize();
   }
-  void Reinitialize() {
+  RT_API_ATTRS void Reinitialize() {
     // per standard: result indices are all zero if no data
     for (int j{0}; j < argRank_; ++j) {
       extremumLoc_[j] = 0;
     }
     previous_ = nullptr;
   }
-  int argRank() const { return argRank_; }
-  template <typename A> void GetResult(A *p, int zeroBasedDim = -1) {
+  RT_API_ATTRS int argRank() const { return argRank_; }
+  template <typename A>
+  RT_API_ATTRS void GetResult(A *p, int zeroBasedDim = -1) {
     if (zeroBasedDim >= 0) {
       *p = extremumLoc_[zeroBasedDim] -
           array_.GetDimension(zeroBasedDim).LowerBound() + 1;
@@ -83,7 +84,8 @@ template <typename COMPARE> class ExtremumLocAccumulator {
       }
     }
   }
-  template <typename IGNORED> bool AccumulateAt(const SubscriptValue at[]) {
+  template <typename IGNORED>
+  RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
     const auto &value{*array_.Element<Type>(at)};
     if (!previous_ || compare_(value, *previous_)) {
       previous_ = &value;
@@ -103,8 +105,8 @@ template <typename COMPARE> class ExtremumLocAccumulator {
 };
 
 template <typename ACCUMULATOR, typename CPPTYPE>
-static void LocationHelper(const char *intrinsic, Descriptor &result,
-    const Descriptor &x, int kind, const Descriptor *mask,
+static RT_API_ATTRS void LocationHelper(const char *intrinsic,
+    Descriptor &result, const Descriptor &x, int kind, const Descriptor *mask,
     Terminator &terminator) {
   ACCUMULATOR accumulator{x};
   DoTotalReduction<CPPTYPE>(x, 0, mask, accumulator, intrinsic, terminator);
@@ -114,9 +116,9 @@ static void LocationHelper(const char *intrinsic, Descriptor &result,
 
 template <TypeCategory CAT, int KIND, bool IS_MAX,
     template <typename, bool, bool> class COMPARE>
-inline void DoMaxOrMinLoc(const char *intrinsic, Descriptor &result,
-    const Descriptor &x, int kind, const char *source, int line,
-    const Descriptor *mask, bool back) {
+inline RT_API_ATTRS void DoMaxOrMinLoc(const char *intrinsic,
+    Descriptor &result, const Descriptor &x, int kind, const char *source,
+    int line, const Descriptor *mask, bool back) {
   using CppType = CppTypeFor<CAT, KIND>;
   Terminator terminator{source, line};
   if (back) {
@@ -130,7 +132,7 @@ inline void DoMaxOrMinLoc(const char *intrinsic, Descriptor &result,
 
 template <bool IS_MAX> struct CharacterMaxOrMinLocHelper {
   template <int KIND> struct Functor {
-    void operator()(const char *intrinsic, Descriptor &result,
+    RT_API_ATTRS void operator()(const char *intrinsic, Descriptor &result,
         const Descriptor &x, int kind, const char *source, int line,
         const Descriptor *mask, bool back) const {
       DoMaxOrMinLoc<TypeCategory::Character, KIND, IS_MAX, NumericCompare>(
@@ -140,9 +142,9 @@ template <bool IS_MAX> struct CharacterMaxOrMinLocHelper {
 };
 
 template <bool IS_MAX>
-inline void CharacterMaxOrMinLoc(const char *intrinsic, Descriptor &result,
-    const Descriptor &x, int kind, const char *source, int line,
-    const Descriptor *mask, bool back) {
+inline RT_API_ATTRS void CharacterMaxOrMinLoc(const char *intrinsic,
+    Descriptor &result, const Descriptor &x, int kind, const char *source,
+    int line, const Descriptor *mask, bool back) {
   int rank{x.rank()};
   SubscriptValue extent[1]{rank};
   result.Establish(TypeCategory::Integer, kind, nullptr, 1, extent,
@@ -169,9 +171,9 @@ inline void CharacterMaxOrMinLoc(const char *intrinsic, Descriptor &result,
 }
 
 template <TypeCategory CAT, int KIND, bool IS_MAXVAL>
-inline void TotalNumericMaxOrMinLoc(const char *intrinsic, Descriptor &result,
-    const Descriptor &x, int kind, const char *source, int line,
-    const Descriptor *mask, bool back) {
+inline RT_API_ATTRS void TotalNumericMaxOrMinLoc(const char *intrinsic,
+    Descriptor &result, const Descriptor &x, int kind, const char *source,
+    int line, const Descriptor *mask, bool back) {
   int rank{x.rank()};
   SubscriptValue extent[1]{rank};
   result.Establish(TypeCategory::Integer, kind, nullptr, 1, extent,
@@ -189,125 +191,129 @@ inline void TotalNumericMaxOrMinLoc(const char *intrinsic, Descriptor &result,
 }
 
 extern "C" {
-void RTNAME(MaxlocCharacter)(Descriptor &result, const Descriptor &x, int kind,
+RT_EXT_API_GROUP_BEGIN
+
+void RTDEF(MaxlocCharacter)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   CharacterMaxOrMinLoc<true>(
       "MAXLOC", result, x, kind, source, line, mask, back);
 }
-void RTNAME(MaxlocInteger1)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocInteger1)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Integer, 1, true>(
       "MAXLOC", result, x, kind, source, line, mask, back);
 }
-void RTNAME(MaxlocInteger2)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocInteger2)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Integer, 2, true>(
       "MAXLOC", result, x, kind, source, line, mask, back);
 }
-void RTNAME(MaxlocInteger4)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocInteger4)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Integer, 4, true>(
       "MAXLOC", result, x, kind, source, line, mask, back);
 }
-void RTNAME(MaxlocInteger8)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocInteger8)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Integer, 8, true>(
       "MAXLOC", result, x, kind, source, line, mask, back);
 }
 #ifdef __SIZEOF_INT128__
-void RTNAME(MaxlocInteger16)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocInteger16)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Integer, 16, true>(
       "MAXLOC", result, x, kind, source, line, mask, back);
 }
 #endif
-void RTNAME(MaxlocReal4)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocReal4)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Real, 4, true>(
       "MAXLOC", result, x, kind, source, line, mask, back);
 }
-void RTNAME(MaxlocReal8)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocReal8)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Real, 8, true>(
       "MAXLOC", result, x, kind, source, line, mask, back);
 }
 #if LDBL_MANT_DIG == 64
-void RTNAME(MaxlocReal10)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocReal10)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Real, 10, true>(
       "MAXLOC", result, x, kind, source, line, mask, back);
 }
 #endif
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-void RTNAME(MaxlocReal16)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocReal16)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Real, 16, true>(
       "MAXLOC", result, x, kind, source, line, mask, back);
 }
 #endif
-void RTNAME(MinlocCharacter)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocCharacter)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   CharacterMaxOrMinLoc<false>(
       "MINLOC", result, x, kind, source, line, mask, back);
 }
-void RTNAME(MinlocInteger1)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocInteger1)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Integer, 1, false>(
       "MINLOC", result, x, kind, source, line, mask, back);
 }
-void RTNAME(MinlocInteger2)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocInteger2)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Integer, 2, false>(
       "MINLOC", result, x, kind, source, line, mask, back);
 }
-void RTNAME(MinlocInteger4)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocInteger4)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Integer, 4, false>(
       "MINLOC", result, x, kind, source, line, mask, back);
 }
-void RTNAME(MinlocInteger8)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocInteger8)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Integer, 8, false>(
       "MINLOC", result, x, kind, source, line, mask, back);
 }
 #ifdef __SIZEOF_INT128__
-void RTNAME(MinlocInteger16)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocInteger16)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Integer, 16, false>(
       "MINLOC", result, x, kind, source, line, mask, back);
 }
 #endif
-void RTNAME(MinlocReal4)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocReal4)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Real, 4, false>(
       "MINLOC", result, x, kind, source, line, mask, back);
 }
-void RTNAME(MinlocReal8)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocReal8)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Real, 8, false>(
       "MINLOC", result, x, kind, source, line, mask, back);
 }
 #if LDBL_MANT_DIG == 64
-void RTNAME(MinlocReal10)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocReal10)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Real, 10, false>(
       "MINLOC", result, x, kind, source, line, mask, back);
 }
 #endif
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-void RTNAME(MinlocReal16)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocReal16)(Descriptor &result, const Descriptor &x, int kind,
     const char *source, int line, const Descriptor *mask, bool back) {
   TotalNumericMaxOrMinLoc<TypeCategory::Real, 16, false>(
       "MINLOC", result, x, kind, source, line, mask, back);
 }
 #endif
+
+RT_EXT_API_GROUP_END
 } // extern "C"
 
 // MAXLOC/MINLOC with DIM=
 
 template <TypeCategory CAT, int KIND, bool IS_MAX,
     template <typename, bool, bool> class COMPARE, bool BACK>
-static void DoPartialMaxOrMinLocDirection(const char *intrinsic,
+static RT_API_ATTRS void DoPartialMaxOrMinLocDirection(const char *intrinsic,
     Descriptor &result, const Descriptor &x, int kind, int dim,
     const Descriptor *mask, Terminator &terminator) {
   using CppType = CppTypeFor<CAT, KIND>;
@@ -320,9 +326,9 @@ static void DoPartialMaxOrMinLocDirection(const char *intrinsic,
 
 template <TypeCategory CAT, int KIND, bool IS_MAX,
     template <typename, bool, bool> class COMPARE>
-inline void DoPartialMaxOrMinLoc(const char *intrinsic, Descriptor &result,
-    const Descriptor &x, int kind, int dim, const Descriptor *mask, bool back,
-    Terminator &terminator) {
+inline RT_API_ATTRS void DoPartialMaxOrMinLoc(const char *intrinsic,
+    Descriptor &result, const Descriptor &x, int kind, int dim,
+    const Descriptor *mask, bool back, Terminator &terminator) {
   if (back) {
     DoPartialMaxOrMinLocDirection<CAT, KIND, IS_MAX, COMPARE, true>(
         intrinsic, result, x, kind, dim, mask, terminator);
@@ -336,7 +342,7 @@ template <TypeCategory CAT, bool IS_MAX,
     template <typename, bool, bool> class COMPARE>
 struct DoPartialMaxOrMinLocHelper {
   template <int KIND> struct Functor {
-    void operator()(const char *intrinsic, Descriptor &result,
+    RT_API_ATTRS void operator()(const char *intrinsic, Descriptor &result,
         const Descriptor &x, int kind, int dim, const Descriptor *mask,
         bool back, Terminator &terminator) const {
       DoPartialMaxOrMinLoc<CAT, KIND, IS_MAX, COMPARE>(
@@ -346,9 +352,9 @@ struct DoPartialMaxOrMinLocHelper {
 };
 
 template <bool IS_MAX>
-inline void TypedPartialMaxOrMinLoc(const char *intrinsic, Descriptor &result,
-    const Descriptor &x, int kind, int dim, const char *source, int line,
-    const Descriptor *mask, bool back) {
+inline RT_API_ATTRS void TypedPartialMaxOrMinLoc(const char *intrinsic,
+    Descriptor &result, const Descriptor &x, int kind, int dim,
+    const char *source, int line, const Descriptor *mask, bool back) {
   Terminator terminator{source, line};
   CheckIntegerKind(terminator, kind, intrinsic);
   auto catKind{x.type().GetCategoryAndKind()};
@@ -398,16 +404,20 @@ inline void TypedPartialMaxOrMinLoc(const char *intrinsic, Descriptor &result,
 }
 
 extern "C" {
-void RTNAME(MaxlocDim)(Descriptor &result, const Descriptor &x, int kind,
+RT_EXT_API_GROUP_BEGIN
+
+void RTDEF(MaxlocDim)(Descriptor &result, const Descriptor &x, int kind,
     int dim, const char *source, int line, const Descriptor *mask, bool back) {
   TypedPartialMaxOrMinLoc<true>(
       "MAXLOC", result, x, kind, dim, source, line, mask, back);
 }
-void RTNAME(MinlocDim)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocDim)(Descriptor &result, const Descriptor &x, int kind,
     int dim, const char *source, int line, const Descriptor *mask, bool back) {
   TypedPartialMaxOrMinLoc<false>(
       "MINLOC", result, x, kind, dim, source, line, mask, back);
 }
+
+RT_EXT_API_GROUP_END
 } // extern "C"
 
 // MAXVAL and MINVAL
@@ -415,7 +425,7 @@ void RTNAME(MinlocDim)(Descriptor &result, const Descriptor &x, int kind,
 template <TypeCategory CAT, int KIND, bool IS_MAXVAL, typename Enable = void>
 struct MaxOrMinIdentity {
   using Type = CppTypeFor<CAT, KIND>;
-  static constexpr Type Value() {
+  static constexpr RT_API_ATTRS Type Value() {
     return IS_MAXVAL ? std::numeric_limits<Type>::lowest()
                      : std::numeric_limits<Type>::max();
   }
@@ -425,7 +435,7 @@ struct MaxOrMinIdentity {
 template <bool IS_MAXVAL>
 struct MaxOrMinIdentity<TypeCategory::Integer, 16, IS_MAXVAL> {
   using Type = CppTypeFor<TypeCategory::Integer, 16>;
-  static constexpr Type Value() {
+  static constexpr RT_API_ATTRS Type Value() {
     return IS_MAXVAL ? Type{1} << 127 : ~Type{0} >> 1;
   }
 };
@@ -444,7 +454,7 @@ struct MaxOrMinIdentity<TypeCategory::Real, 16, IS_MAXVAL,
     typename std::enable_if_t<
         std::is_same_v<CppTypeFor<TypeCategory::Real, 16>, __float128>>> {
   using Type = __float128;
-  static Type Value() {
+  static RT_API_ATTRS Type Value() {
     // Create a buffer to store binary representation of __float128 constant.
     constexpr std::size_t alignment =
         std::max(alignof(Type), alignof(std::uint64_t));
@@ -472,15 +482,16 @@ template <TypeCategory CAT, int KIND, bool IS_MAXVAL>
 class NumericExtremumAccumulator {
 public:
   using Type = CppTypeFor<CAT, KIND>;
-  explicit NumericExtremumAccumulator(const Descriptor &array)
+  explicit RT_API_ATTRS NumericExtremumAccumulator(const Descriptor &array)
       : array_{array} {}
-  void Reinitialize() {
+  RT_API_ATTRS void Reinitialize() {
     extremum_ = MaxOrMinIdentity<CAT, KIND, IS_MAXVAL>::Value();
   }
-  template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+  template <typename A>
+  RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
     *p = extremum_;
   }
-  bool Accumulate(Type x) {
+  RT_API_ATTRS bool Accumulate(Type x) {
     if constexpr (IS_MAXVAL) {
       if (x > extremum_) {
         extremum_ = x;
@@ -490,7 +501,8 @@ class NumericExtremumAccumulator {
     }
     return true;
   }
-  template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+  template <typename A>
+  RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
     return Accumulate(*array_.Element<A>(at));
   }
 
@@ -500,16 +512,17 @@ class NumericExtremumAccumulator {
 };
 
 template <TypeCategory CAT, int KIND, bool IS_MAXVAL>
-inline CppTypeFor<CAT, KIND> TotalNumericMaxOrMin(const Descriptor &x,
-    const char *source, int line, int dim, const Descriptor *mask,
-    const char *intrinsic) {
+inline RT_API_ATTRS CppTypeFor<CAT, KIND> TotalNumericMaxOrMin(
+    const Descriptor &x, const char *source, int line, int dim,
+    const Descriptor *mask, const char *intrinsic) {
   return GetTotalReduction<CAT, KIND>(x, source, line, dim, mask,
       NumericExtremumAccumulator<CAT, KIND, IS_MAXVAL>{x}, intrinsic);
 }
 
 template <TypeCategory CAT, int KIND, typename ACCUMULATOR>
-static void DoMaxMinNorm2(Descriptor &result, const Descriptor &x, int dim,
-    const Descriptor *mask, const char *intrinsic, Terminator &terminator) {
+static RT_API_ATTRS void DoMaxMinNorm2(Descriptor &result, const Descriptor &x,
+    int dim, const Descriptor *mask, const char *intrinsic,
+    Terminator &terminator) {
   using Type = CppTypeFor<CAT, KIND>;
   ACCUMULATOR accumulator{x};
   if (dim == 0 || x.rank() == 1) {
@@ -537,8 +550,8 @@ static void DoMaxMinNorm2(Descriptor &result, const Descriptor &x, int dim,
 
 template <TypeCategory CAT, bool IS_MAXVAL> struct MaxOrMinHelper {
   template <int KIND> struct Functor {
-    void operator()(Descriptor &result, const Descriptor &x, int dim,
-        const Descriptor *mask, const char *intrinsic,
+    RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
+        int dim, const Descriptor *mask, const char *intrinsic,
         Terminator &terminator) const {
       DoMaxMinNorm2<CAT, KIND,
           NumericExtremumAccumulator<CAT, KIND, IS_MAXVAL>>(
@@ -548,9 +561,9 @@ template <TypeCategory CAT, bool IS_MAXVAL> struct MaxOrMinHelper {
 };
 
 template <bool IS_MAXVAL>
-inline void NumericMaxOrMin(Descriptor &result, const Descriptor &x, int dim,
-    const char *source, int line, const Descriptor *mask,
-    const char *intrinsic) {
+inline RT_API_ATTRS void NumericMaxOrMin(Descriptor &result,
+    const Descriptor &x, int dim, const char *source, int line,
+    const Descriptor *mask, const char *intrinsic) {
   Terminator terminator{source, line};
   auto type{x.type().GetCategoryAndKind()};
   RUNTIME_CHECK(terminator, type);
@@ -574,10 +587,11 @@ inline void NumericMaxOrMin(Descriptor &result, const Descriptor &x, int dim,
 template <int KIND, bool IS_MAXVAL> class CharacterExtremumAccumulator {
 public:
   using Type = CppTypeFor<TypeCategory::Character, KIND>;
-  explicit CharacterExtremumAccumulator(const Descriptor &array)
+  explicit RT_API_ATTRS CharacterExtremumAccumulator(const Descriptor &array)
       : array_{array}, charLen_{array_.ElementBytes() / KIND} {}
-  void Reinitialize() { extremum_ = nullptr; }
-  template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+  RT_API_ATTRS void Reinitialize() { extremum_ = nullptr; }
+  template <typename A>
+  RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
     static_assert(std::is_same_v<A, Type>);
     std::size_t byteSize{array_.ElementBytes()};
     if (extremum_) {
@@ -589,7 +603,7 @@ template <int KIND, bool IS_MAXVAL> class CharacterExtremumAccumulator {
       std::memset(p, IS_MAXVAL ? 0 : KIND == 1 ? 127 : 255, byteSize);
     }
   }
-  bool Accumulate(const Type *x) {
+  RT_API_ATTRS bool Accumulate(const Type *x) {
     if (!extremum_) {
       extremum_ = x;
     } else {
@@ -600,7 +614,8 @@ template <int KIND, bool IS_MAXVAL> class CharacterExtremumAccumulator {
     }
     return true;
   }
-  template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+  template <typename A>
+  RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
     return Accumulate(array_.Element<A>(at));
   }
 
@@ -612,8 +627,8 @@ template <int KIND, bool IS_MAXVAL> class CharacterExtremumAccumulator {
 
 template <bool IS_MAXVAL> struct CharacterMaxOrMinHelper {
   template <int KIND> struct Functor {
-    void operator()(Descriptor &result, const Descriptor &x, int dim,
-        const Descriptor *mask, const char *intrinsic,
+    RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
+        int dim, const Descriptor *mask, const char *intrinsic,
         Terminator &terminator) const {
       DoMaxMinNorm2<TypeCategory::Character, KIND,
           CharacterExtremumAccumulator<KIND, IS_MAXVAL>>(
@@ -623,9 +638,9 @@ template <bool IS_MAXVAL> struct CharacterMaxOrMinHelper {
 };
 
 template <bool IS_MAXVAL>
-inline void CharacterMaxOrMin(Descriptor &result, const Descriptor &x, int dim,
-    const char *source, int line, const Descriptor *mask,
-    const char *intrinsic) {
+inline RT_API_ATTRS void CharacterMaxOrMin(Descriptor &result,
+    const Descriptor &x, int dim, const char *source, int line,
+    const Descriptor *mask, const char *intrinsic) {
   Terminator terminator{source, line};
   auto type{x.type().GetCategoryAndKind()};
   RUNTIME_CHECK(terminator, type && type->first == TypeCategory::Character);
@@ -635,28 +650,30 @@ inline void CharacterMaxOrMin(Descriptor &result, const Descriptor &x, int dim,
 }
 
 extern "C" {
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(MaxvalInteger1)(const Descriptor &x,
+RT_EXT_API_GROUP_BEGIN
+
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(MaxvalInteger1)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Integer, 1, true>(
       x, source, line, dim, mask, "MAXVAL");
 }
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(MaxvalInteger2)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(MaxvalInteger2)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Integer, 2, true>(
       x, source, line, dim, mask, "MAXVAL");
 }
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(MaxvalInteger4)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(MaxvalInteger4)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Integer, 4, true>(
       x, source, line, dim, mask, "MAXVAL");
 }
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(MaxvalInteger8)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(MaxvalInteger8)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Integer, 8, true>(
       x, source, line, dim, mask, "MAXVAL");
 }
 #ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(MaxvalInteger16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(MaxvalInteger16)(
     const Descriptor &x, const char *source, int line, int dim,
     const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Integer, 16, true>(
@@ -665,58 +682,58 @@ CppTypeFor<TypeCategory::Integer, 16> RTNAME(MaxvalInteger16)(
 #endif
 
 // TODO: REAL(2 & 3)
-CppTypeFor<TypeCategory::Real, 4> RTNAME(MaxvalReal4)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 4> RTDEF(MaxvalReal4)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Real, 4, true>(
       x, source, line, dim, mask, "MAXVAL");
 }
-CppTypeFor<TypeCategory::Real, 8> RTNAME(MaxvalReal8)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 8> RTDEF(MaxvalReal8)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Real, 8, true>(
       x, source, line, dim, mask, "MAXVAL");
 }
 #if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(MaxvalReal10)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 10> RTDEF(MaxvalReal10)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Real, 10, true>(
       x, source, line, dim, mask, "MAXVAL");
 }
 #endif
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(MaxvalReal16)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 16> RTDEF(MaxvalReal16)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Real, 16, true>(
       x, source, line, dim, mask, "MAXVAL");
 }
 #endif
 
-void RTNAME(MaxvalCharacter)(Descriptor &result, const Descriptor &x,
+void RTDEF(MaxvalCharacter)(Descriptor &result, const Descriptor &x,
     const char *source, int line, const Descriptor *mask) {
   CharacterMaxOrMin<true>(result, x, 0, source, line, mask, "MAXVAL");
 }
 
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(MinvalInteger1)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(MinvalInteger1)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Integer, 1, false>(
       x, source, line, dim, mask, "MINVAL");
 }
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(MinvalInteger2)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(MinvalInteger2)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Integer, 2, false>(
       x, source, line, dim, mask, "MINVAL");
 }
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(MinvalInteger4)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(MinvalInteger4)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Integer, 4, false>(
       x, source, line, dim, mask, "MINVAL");
 }
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(MinvalInteger8)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(MinvalInteger8)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Integer, 8, false>(
       x, source, line, dim, mask, "MINVAL");
 }
 #ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(MinvalInteger16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(MinvalInteger16)(
     const Descriptor &x, const char *source, int line, int dim,
     const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Integer, 16, false>(
@@ -725,37 +742,37 @@ CppTypeFor<TypeCategory::Integer, 16> RTNAME(MinvalInteger16)(
 #endif
 
 // TODO: REAL(2 & 3)
-CppTypeFor<TypeCategory::Real, 4> RTNAME(MinvalReal4)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 4> RTDEF(MinvalReal4)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Real, 4, false>(
       x, source, line, dim, mask, "MINVAL");
 }
-CppTypeFor<TypeCategory::Real, 8> RTNAME(MinvalReal8)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 8> RTDEF(MinvalReal8)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Real, 8, false>(
       x, source, line, dim, mask, "MINVAL");
 }
 #if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(MinvalReal10)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 10> RTDEF(MinvalReal10)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Real, 10, false>(
       x, source, line, dim, mask, "MINVAL");
 }
 #endif
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(MinvalReal16)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 16> RTDEF(MinvalReal16)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return TotalNumericMaxOrMin<TypeCategory::Real, 16, false>(
       x, source, line, dim, mask, "MINVAL");
 }
 #endif
 
-void RTNAME(MinvalCharacter)(Descriptor &result, const Descriptor &x,
+void RTDEF(MinvalCharacter)(Descriptor &result, const Descriptor &x,
     const char *source, int line, const Descriptor *mask) {
   CharacterMaxOrMin<false>(result, x, 0, source, line, mask, "MINVAL");
 }
 
-void RTNAME(MaxvalDim)(Descriptor &result, const Descriptor &x, int dim,
+void RTDEF(MaxvalDim)(Descriptor &result, const Descriptor &x, int dim,
     const char *source, int line, const Descriptor *mask) {
   if (x.type().IsCharacter()) {
     CharacterMaxOrMin<true>(result, x, dim, source, line, mask, "MAXVAL");
@@ -763,7 +780,7 @@ void RTNAME(MaxvalDim)(Descriptor &result, const Descriptor &x, int dim,
     NumericMaxOrMin<true>(result, x, dim, source, line, mask, "MAXVAL");
   }
 }
-void RTNAME(MinvalDim)(Descriptor &result, const Descriptor &x, int dim,
+void RTDEF(MinvalDim)(Descriptor &result, const Descriptor &x, int dim,
     const char *source, int line, const Descriptor *mask) {
   if (x.type().IsCharacter()) {
     CharacterMaxOrMin<false>(result, x, dim, source, line, mask, "MINVAL");
@@ -771,33 +788,42 @@ void RTNAME(MinvalDim)(Descriptor &result, const Descriptor &x, int dim,
     NumericMaxOrMin<false>(result, x, dim, source, line, mask, "MINVAL");
   }
 }
+
+RT_EXT_API_GROUP_END
 } // extern "C"
 
 // NORM2
 
-template <int KIND> class Norm2Accumulator {
-public:
-  using Type = CppTypeFor<TypeCategory::Real, KIND>;
-  // Use at least double precision for accumulators.
-  // Don't use __float128, it doesn't work with abs() or sqrt() yet.
-  static constexpr int largestLDKind {
+RT_VAR_GROUP_BEGIN
+
+// Use at least double precision for accumulators.
+// Don't use __float128, it doesn't work with abs() or sqrt() yet.
+static constexpr RT_CONST_VAR_ATTRS int largestLDKind {
 #if LDBL_MANT_DIG == 113
-    16
+  16
 #elif LDBL_MANT_DIG == 64
-    10
+  10
 #else
-    8
+  8
 #endif
-  };
+};
+
+RT_VAR_GROUP_END
+
+template <int KIND> class Norm2Accumulator {
+public:
+  using Type = CppTypeFor<TypeCategory::Real, KIND>;
   using AccumType =
       CppTypeFor<TypeCategory::Real, std::clamp(KIND, 8, largestLDKind)>;
-  explicit Norm2Accumulator(const Descriptor &array) : array_{array} {}
-  void Reinitialize() { max_ = sum_ = 0; }
-  template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+  explicit RT_API_ATTRS Norm2Accumulator(const Descriptor &array)
+      : array_{array} {}
+  RT_API_ATTRS void Reinitialize() { max_ = sum_ = 0; }
+  template <typename A>
+  RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
     // m * sqrt(1 + sum((others(:)/m)**2))
     *p = static_cast<Type>(max_ * std::sqrt(1 + sum_));
   }
-  bool Accumulate(Type x) {
+  RT_API_ATTRS bool Accumulate(Type x) {
     auto absX{std::abs(static_cast<AccumType>(x))};
     if (!max_) {
       max_ = absX;
@@ -813,7 +839,8 @@ template <int KIND> class Norm2Accumulator {
     }
     return true;
   }
-  template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+  template <typename A>
+  RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
     return Accumulate(*array_.Element<A>(at));
   }
 
@@ -824,7 +851,7 @@ template <int KIND> class Norm2Accumulator {
 };
 
 template <int KIND> struct Norm2Helper {
-  void operator()(Descriptor &result, const Descriptor &x, int dim,
+  RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x, int dim,
       const Descriptor *mask, Terminator &terminator) const {
     DoMaxMinNorm2<TypeCategory::Real, KIND, Norm2Accumulator<KIND>>(
         result, x, dim, mask, "NORM2", terminator);
@@ -832,33 +859,35 @@ template <int KIND> struct Norm2Helper {
 };
 
 extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
 // TODO: REAL(2 & 3)
-CppTypeFor<TypeCategory::Real, 4> RTNAME(Norm2_4)(
+CppTypeFor<TypeCategory::Real, 4> RTDEF(Norm2_4)(
     const Descriptor &x, const char *source, int line, int dim) {
   return GetTotalReduction<TypeCategory::Real, 4>(
       x, source, line, dim, nullptr, Norm2Accumulator<4>{x}, "NORM2");
 }
-CppTypeFor<TypeCategory::Real, 8> RTNAME(Norm2_8)(
+CppTypeFor<TypeCategory::Real, 8> RTDEF(Norm2_8)(
     const Descriptor &x, const char *source, int line, int dim) {
   return GetTotalReduction<TypeCategory::Real, 8>(
       x, source, line, dim, nullptr, Norm2Accumulator<8>{x}, "NORM2");
 }
 #if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(Norm2_10)(
+CppTypeFor<TypeCategory::Real, 10> RTDEF(Norm2_10)(
     const Descriptor &x, const char *source, int line, int dim) {
   return GetTotalReduction<TypeCategory::Real, 10>(
       x, source, line, dim, nullptr, Norm2Accumulator<10>{x}, "NORM2");
 }
 #endif
 #if LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTNAME(Norm2_16)(
+CppTypeFor<TypeCategory::Real, 16> RTDEF(Norm2_16)(
     const Descriptor &x, const char *source, int line, int dim) {
   return GetTotalReduction<TypeCategory::Real, 16>(
       x, source, line, dim, nullptr, Norm2Accumulator<16>{x}, "NORM2");
 }
 #endif
 
-void RTNAME(Norm2Dim)(Descriptor &result, const Descriptor &x, int dim,
+void RTDEF(Norm2Dim)(Descriptor &result, const Descriptor &x, int dim,
     const char *source, int line) {
   Terminator terminator{source, line};
   auto type{x.type().GetCategoryAndKind()};
@@ -870,5 +899,7 @@ void RTNAME(Norm2Dim)(Descriptor &result, const Descriptor &x, int dim,
     terminator.Crash("NORM2: bad type code %d", x.type().raw());
   }
 }
+
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/findloc.cpp b/flang/runtime/findloc.cpp
index 6b60e523d2a47e..674a21ae50b853 100644
--- a/flang/runtime/findloc.cpp
+++ b/flang/runtime/findloc.cpp
@@ -21,8 +21,8 @@ template <TypeCategory CAT1, int KIND1, TypeCategory CAT2, int KIND2>
 struct Equality {
   using Type1 = CppTypeFor<CAT1, KIND1>;
   using Type2 = CppTypeFor<CAT2, KIND2>;
-  bool operator()(const Descriptor &array, const SubscriptValue at[],
-      const Descriptor &target) const {
+  RT_API_ATTRS bool operator()(const Descriptor &array,
+      const SubscriptValue at[], const Descriptor &target) const {
     return *array.Element<Type1>(at) == *target.OffsetElement<Type2>();
   }
 };
@@ -31,8 +31,8 @@ template <int KIND1, int KIND2>
 struct Equality<TypeCategory::Complex, KIND1, TypeCategory::Complex, KIND2> {
   using Type1 = CppTypeFor<TypeCategory::Complex, KIND1>;
   using Type2 = CppTypeFor<TypeCategory::Complex, KIND2>;
-  bool operator()(const Descriptor &array, const SubscriptValue at[],
-      const Descriptor &target) const {
+  RT_API_ATTRS bool operator()(const Descriptor &array,
+      const SubscriptValue at[], const Descriptor &target) const {
     const Type1 &xz{*array.Element<Type1>(at)};
     const Type2 &tz{*target.OffsetElement<Type2>()};
     return xz.real() == tz.real() && xz.imag() == tz.imag();
@@ -43,8 +43,8 @@ template <int KIND1, TypeCategory CAT2, int KIND2>
 struct Equality<TypeCategory::Complex, KIND1, CAT2, KIND2> {
   using Type1 = CppTypeFor<TypeCategory::Complex, KIND1>;
   using Type2 = CppTypeFor<CAT2, KIND2>;
-  bool operator()(const Descriptor &array, const SubscriptValue at[],
-      const Descriptor &target) const {
+  RT_API_ATTRS bool operator()(const Descriptor &array,
+      const SubscriptValue at[], const Descriptor &target) const {
     const Type1 &z{*array.Element<Type1>(at)};
     return z.imag() == 0 && z.real() == *target.OffsetElement<Type2>();
   }
@@ -54,8 +54,8 @@ template <TypeCategory CAT1, int KIND1, int KIND2>
 struct Equality<CAT1, KIND1, TypeCategory::Complex, KIND2> {
   using Type1 = CppTypeFor<CAT1, KIND1>;
   using Type2 = CppTypeFor<TypeCategory::Complex, KIND2>;
-  bool operator()(const Descriptor &array, const SubscriptValue at[],
-      const Descriptor &target) const {
+  RT_API_ATTRS bool operator()(const Descriptor &array,
+      const SubscriptValue at[], const Descriptor &target) const {
     const Type2 &z{*target.OffsetElement<Type2>()};
     return *array.Element<Type1>(at) == z.real() && z.imag() == 0;
   }
@@ -63,8 +63,8 @@ struct Equality<CAT1, KIND1, TypeCategory::Complex, KIND2> {
 
 template <int KIND> struct CharacterEquality {
   using Type = CppTypeFor<TypeCategory::Character, KIND>;
-  bool operator()(const Descriptor &array, const SubscriptValue at[],
-      const Descriptor &target) const {
+  RT_API_ATTRS bool operator()(const Descriptor &array,
+      const SubscriptValue at[], const Descriptor &target) const {
     return CharacterScalarCompare<Type>(array.Element<Type>(at),
                target.OffsetElement<Type>(),
                array.ElementBytes() / static_cast<unsigned>(KIND),
@@ -73,8 +73,8 @@ template <int KIND> struct CharacterEquality {
 };
 
 struct LogicalEquivalence {
-  bool operator()(const Descriptor &array, const SubscriptValue at[],
-      const Descriptor &target) const {
+  RT_API_ATTRS bool operator()(const Descriptor &array,
+      const SubscriptValue at[], const Descriptor &target) const {
     return IsLogicalElementTrue(array, at) ==
         IsLogicalElementTrue(target, at /*ignored*/);
   }
@@ -82,11 +82,12 @@ struct LogicalEquivalence {
 
 template <typename EQUALITY> class LocationAccumulator {
 public:
-  LocationAccumulator(
+  RT_API_ATTRS LocationAccumulator(
       const Descriptor &array, const Descriptor &target, bool back)
       : array_{array}, target_{target}, back_{back} {}
-  void Reinitialize() { gotAnything_ = false; }
-  template <typename A> void GetResult(A *p, int zeroBasedDim = -1) {
+  RT_API_ATTRS void Reinitialize() { gotAnything_ = false; }
+  template <typename A>
+  RT_API_ATTRS void GetResult(A *p, int zeroBasedDim = -1) {
     if (zeroBasedDim >= 0) {
       *p = gotAnything_ ? location_[zeroBasedDim] -
               array_.GetDimension(zeroBasedDim).LowerBound() + 1
@@ -102,7 +103,8 @@ template <typename EQUALITY> class LocationAccumulator {
       }
     }
   }
-  template <typename IGNORED> bool AccumulateAt(const SubscriptValue at[]) {
+  template <typename IGNORED>
+  RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
     if (equality_(array_, at, target_)) {
       gotAnything_ = true;
       for (int j{0}; j < rank_; ++j) {
@@ -127,7 +129,7 @@ template <typename EQUALITY> class LocationAccumulator {
 template <TypeCategory XCAT, int XKIND, TypeCategory TARGET_CAT>
 struct TotalNumericFindlocHelper {
   template <int TARGET_KIND> struct Functor {
-    void operator()(Descriptor &result, const Descriptor &x,
+    RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
         const Descriptor &target, int kind, int dim, const Descriptor *mask,
         bool back, Terminator &terminator) const {
       using Eq = Equality<XCAT, XKIND, TARGET_CAT, TARGET_KIND>;
@@ -145,9 +147,10 @@ template <TypeCategory CAT,
     class HELPER>
 struct NumericFindlocHelper {
   template <int KIND> struct Functor {
-    void operator()(TypeCategory targetCat, int targetKind, Descriptor &result,
-        const Descriptor &x, const Descriptor &target, int kind, int dim,
-        const Descriptor *mask, bool back, Terminator &terminator) const {
+    RT_API_ATTRS void operator()(TypeCategory targetCat, int targetKind,
+        Descriptor &result, const Descriptor &x, const Descriptor &target,
+        int kind, int dim, const Descriptor *mask, bool back,
+        Terminator &terminator) const {
       switch (targetCat) {
       case TypeCategory::Integer:
         ApplyIntegerKind<
@@ -177,7 +180,7 @@ struct NumericFindlocHelper {
 };
 
 template <int KIND> struct CharacterFindlocHelper {
-  void operator()(Descriptor &result, const Descriptor &x,
+  RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
       const Descriptor &target, int kind, const Descriptor *mask, bool back,
       Terminator &terminator) {
     using Accumulator = LocationAccumulator<CharacterEquality<KIND>>;
@@ -188,9 +191,9 @@ template <int KIND> struct CharacterFindlocHelper {
   }
 };
 
-static void LogicalFindlocHelper(Descriptor &result, const Descriptor &x,
-    const Descriptor &target, int kind, const Descriptor *mask, bool back,
-    Terminator &terminator) {
+static RT_API_ATTRS void LogicalFindlocHelper(Descriptor &result,
+    const Descriptor &x, const Descriptor &target, int kind,
+    const Descriptor *mask, bool back, Terminator &terminator) {
   using Accumulator = LocationAccumulator<LogicalEquivalence>;
   Accumulator accumulator{x, target, back};
   DoTotalReduction<void>(x, 0, mask, accumulator, "FINDLOC", terminator);
@@ -199,7 +202,9 @@ static void LogicalFindlocHelper(Descriptor &result, const Descriptor &x,
 }
 
 extern "C" {
-void RTNAME(Findloc)(Descriptor &result, const Descriptor &x,
+RT_EXT_API_GROUP_BEGIN
+
+void RTDEF(Findloc)(Descriptor &result, const Descriptor &x,
     const Descriptor &target, int kind, const char *source, int line,
     const Descriptor *mask, bool back) {
   int rank{x.rank()};
@@ -251,6 +256,8 @@ void RTNAME(Findloc)(Descriptor &result, const Descriptor &x,
         "FINDLOC: bad data type code (%d) for array", x.type().raw());
   }
 }
+
+RT_EXT_API_GROUP_END
 } // extern "C"
 
 // FINDLOC with DIM=
@@ -258,7 +265,7 @@ void RTNAME(Findloc)(Descriptor &result, const Descriptor &x,
 template <TypeCategory XCAT, int XKIND, TypeCategory TARGET_CAT>
 struct PartialNumericFindlocHelper {
   template <int TARGET_KIND> struct Functor {
-    void operator()(Descriptor &result, const Descriptor &x,
+    RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
         const Descriptor &target, int kind, int dim, const Descriptor *mask,
         bool back, Terminator &terminator) const {
       using Eq = Equality<XCAT, XKIND, TARGET_CAT, TARGET_KIND>;
@@ -272,7 +279,7 @@ struct PartialNumericFindlocHelper {
 };
 
 template <int KIND> struct PartialCharacterFindlocHelper {
-  void operator()(Descriptor &result, const Descriptor &x,
+  RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
       const Descriptor &target, int kind, int dim, const Descriptor *mask,
       bool back, Terminator &terminator) {
     using Accumulator = LocationAccumulator<CharacterEquality<KIND>>;
@@ -283,9 +290,9 @@ template <int KIND> struct PartialCharacterFindlocHelper {
   }
 };
 
-static void PartialLogicalFindlocHelper(Descriptor &result, const Descriptor &x,
-    const Descriptor &target, int kind, int dim, const Descriptor *mask,
-    bool back, Terminator &terminator) {
+static RT_API_ATTRS void PartialLogicalFindlocHelper(Descriptor &result,
+    const Descriptor &x, const Descriptor &target, int kind, int dim,
+    const Descriptor *mask, bool back, Terminator &terminator) {
   using Accumulator = LocationAccumulator<LogicalEquivalence>;
   Accumulator accumulator{x, target, back};
   ApplyIntegerKind<PartialLocationHelper<Accumulator>::template Functor, void>(
@@ -294,7 +301,9 @@ static void PartialLogicalFindlocHelper(Descriptor &result, const Descriptor &x,
 }
 
 extern "C" {
-void RTNAME(FindlocDim)(Descriptor &result, const Descriptor &x,
+RT_EXT_API_GROUP_BEGIN
+
+void RTDEF(FindlocDim)(Descriptor &result, const Descriptor &x,
     const Descriptor &target, int kind, int dim, const char *source, int line,
     const Descriptor *mask, bool back) {
   Terminator terminator{source, line};
@@ -338,5 +347,7 @@ void RTNAME(FindlocDim)(Descriptor &result, const Descriptor &x,
         "FINDLOC: bad data type code (%d) for array", x.type().raw());
   }
 }
+
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/freestanding-tools.h b/flang/runtime/freestanding-tools.h
index 28248f76e882af..757c6d07e5711c 100644
--- a/flang/runtime/freestanding-tools.h
+++ b/flang/runtime/freestanding-tools.h
@@ -37,6 +37,16 @@
 #define STD_MEMCMP_UNSUPPORTED 1
 #endif
 
+#if !defined(STD_REALLOC_UNSUPPORTED) && \
+    (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define STD_REALLOC_UNSUPPORTED 1
+#endif
+
+#if !defined(STD_CALLOC_UNSUPPORTED) && \
+    (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define STD_CALLOC_UNSUPPORTED 1
+#endif
+
 namespace Fortran::runtime {
 
 #if STD_FILL_N_UNSUPPORTED
@@ -118,5 +128,30 @@ static inline RT_API_ATTRS int memcmp(
 using std::memcmp;
 #endif // !STD_MEMCMP_UNSUPPORTED
 
+#if STD_REALLOC_UNSUPPORTED
+static inline RT_API_ATTRS void *realloc(void *ptr, std::size_t new_size) {
+  // Return nullptr and let the callers assert that.
+  // TODO: we can provide a straightforward implementation
+  // via malloc/memcpy/free.
+  return nullptr;
+}
+#else // !STD_REALLOC_UNSUPPORTED
+using std::realloc;
+#endif // !STD_REALLOC_UNSUPPORTED
+
+#if STD_CALLOC_UNSUPPORTED
+static inline RT_API_ATTRS void *calloc(std::size_t num, std::size_t size) {
+  std::size_t bytes{num * size};
+  void *ptr{nullptr};
+  if (bytes != 0) {
+    ptr = std::malloc(bytes);
+    ptr = std::memset(ptr, 0, bytes);
+  }
+  return ptr;
+}
+#else // !STD_CALLOC_UNSUPPORTED
+using std::calloc;
+#endif // !STD_CALLOC_UNSUPPORTED
+
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_FREESTANDING_TOOLS_H_
diff --git a/flang/runtime/inquiry.cpp b/flang/runtime/inquiry.cpp
index 5dc692c9a384a7..2b59a1cfab1a9b 100644
--- a/flang/runtime/inquiry.cpp
+++ b/flang/runtime/inquiry.cpp
@@ -19,7 +19,7 @@
 namespace Fortran::runtime {
 
 extern "C" {
-std::int64_t RTNAME(LboundDim)(
+std::int64_t RTDEF(LboundDim)(
     const Descriptor &array, int dim, const char *sourceFile, int line) {
   if (dim < 1 || dim > array.rank()) {
     Terminator terminator{sourceFile, line};
@@ -30,7 +30,7 @@ std::int64_t RTNAME(LboundDim)(
   return static_cast<std::int64_t>(dimension.LowerBound());
 }
 
-void RTNAME(Ubound)(Descriptor &result, const Descriptor &array, int kind,
+void RTDEF(Ubound)(Descriptor &result, const Descriptor &array, int kind,
     const char *sourceFile, int line) {
   SubscriptValue extent[1]{array.rank()};
   result.Establish(TypeCategory::Integer, kind, nullptr, 1, extent,
@@ -55,7 +55,7 @@ void RTNAME(Ubound)(Descriptor &result, const Descriptor &array, int kind,
   }
 }
 
-std::int64_t RTNAME(Size)(
+std::int64_t RTDEF(Size)(
     const Descriptor &array, const char *sourceFile, int line) {
   std::int64_t result{1};
   for (int i = 0; i < array.rank(); ++i) {
@@ -65,7 +65,7 @@ std::int64_t RTNAME(Size)(
   return result;
 }
 
-std::int64_t RTNAME(SizeDim)(
+std::int64_t RTDEF(SizeDim)(
     const Descriptor &array, int dim, const char *sourceFile, int line) {
   if (dim < 1 || dim > array.rank()) {
     Terminator terminator{sourceFile, line};
diff --git a/flang/runtime/matmul-transpose.cpp b/flang/runtime/matmul-transpose.cpp
index 3d745575f25872..ee5fcd842b0258 100644
--- a/flang/runtime/matmul-transpose.cpp
+++ b/flang/runtime/matmul-transpose.cpp
@@ -387,6 +387,8 @@ template <bool IS_ALLOCATING> struct MatmulTranspose {
 
 namespace Fortran::runtime {
 extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
 void RTDEF(MatmulTranspose)(Descriptor &result, const Descriptor &x,
     const Descriptor &y, const char *sourceFile, int line) {
   MatmulTranspose<true>{}(result, x, y, sourceFile, line);
@@ -395,5 +397,7 @@ void RTDEF(MatmulTransposeDirect)(const Descriptor &result, const Descriptor &x,
     const Descriptor &y, const char *sourceFile, int line) {
   MatmulTranspose<false>{}(result, x, y, sourceFile, line);
 }
+
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/matmul.cpp b/flang/runtime/matmul.cpp
index f06740a24b2f9a..e4595db779260f 100644
--- a/flang/runtime/matmul.cpp
+++ b/flang/runtime/matmul.cpp
@@ -469,6 +469,8 @@ template <bool IS_ALLOCATING> struct Matmul {
 };
 
 extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
 void RTDEF(Matmul)(Descriptor &result, const Descriptor &x, const Descriptor &y,
     const char *sourceFile, int line) {
   Matmul<true>{}(result, x, y, sourceFile, line);
@@ -477,5 +479,7 @@ void RTDEF(MatmulDirect)(const Descriptor &result, const Descriptor &x,
     const Descriptor &y, const char *sourceFile, int line) {
   Matmul<false>{}(result, x, y, sourceFile, line);
 }
+
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/memory.cpp b/flang/runtime/memory.cpp
index 5ed737905a9cb0..3f9b706f3dff67 100644
--- a/flang/runtime/memory.cpp
+++ b/flang/runtime/memory.cpp
@@ -11,8 +11,10 @@
 #include <cstdlib>
 
 namespace Fortran::runtime {
+RT_OFFLOAD_VAR_GROUP_BEGIN
 
-void *AllocateMemoryOrCrash(const Terminator &terminator, std::size_t bytes) {
+RT_API_ATTRS void *AllocateMemoryOrCrash(
+    const Terminator &terminator, std::size_t bytes) {
   if (void *p{std::malloc(bytes)}) {
     return p;
   }
@@ -24,5 +26,7 @@ void *AllocateMemoryOrCrash(const Terminator &terminator, std::size_t bytes) {
   return nullptr;
 }
 
-void FreeMemory(void *p) { std::free(p); }
+RT_API_ATTRS void FreeMemory(void *p) { std::free(p); }
+
+RT_OFFLOAD_VAR_GROUP_END
 } // namespace Fortran::runtime
diff --git a/flang/runtime/misc-intrinsic.cpp b/flang/runtime/misc-intrinsic.cpp
index 19eb9351d47fc7..56f2028c2ff02c 100644
--- a/flang/runtime/misc-intrinsic.cpp
+++ b/flang/runtime/misc-intrinsic.cpp
@@ -8,6 +8,7 @@
 
 #include "flang/Runtime/misc-intrinsic.h"
 #include "terminator.h"
+#include "tools.h"
 #include "flang/Runtime/descriptor.h"
 #include <algorithm>
 #include <cstring>
@@ -15,9 +16,9 @@
 
 namespace Fortran::runtime {
 
-static void TransferImpl(Descriptor &result, const Descriptor &source,
-    const Descriptor &mold, const char *sourceFile, int line,
-    std::optional<std::int64_t> resultExtent) {
+static RT_API_ATTRS void TransferImpl(Descriptor &result,
+    const Descriptor &source, const Descriptor &mold, const char *sourceFile,
+    int line, std::optional<std::int64_t> resultExtent) {
   int rank{resultExtent.has_value() ? 1 : 0};
   std::size_t elementBytes{mold.ElementBytes()};
   result.Establish(mold.type(), elementBytes, nullptr, rank, nullptr,
@@ -52,8 +53,9 @@ static void TransferImpl(Descriptor &result, const Descriptor &source,
 }
 
 extern "C" {
+RT_EXT_API_GROUP_BEGIN
 
-void RTNAME(Transfer)(Descriptor &result, const Descriptor &source,
+void RTDEF(Transfer)(Descriptor &result, const Descriptor &source,
     const Descriptor &mold, const char *sourceFile, int line) {
   std::optional<std::int64_t> elements;
   if (mold.rank() > 0) {
@@ -67,18 +69,19 @@ void RTNAME(Transfer)(Descriptor &result, const Descriptor &source,
                                            "when SOURCE= is not zero-sized");
       }
     } else {
-      elements = 0;
+      elements = std::int64_t{0};
     }
   }
   return TransferImpl(
       result, source, mold, sourceFile, line, std::move(elements));
 }
 
-void RTNAME(TransferSize)(Descriptor &result, const Descriptor &source,
+void RTDEF(TransferSize)(Descriptor &result, const Descriptor &source,
     const Descriptor &mold, const char *sourceFile, int line,
     std::int64_t size) {
   return TransferImpl(result, source, mold, sourceFile, line, size);
 }
 
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp
index 38835c2b753cef..6cbf00e0c36c78 100644
--- a/flang/runtime/numeric.cpp
+++ b/flang/runtime/numeric.cpp
@@ -302,6 +302,7 @@ RT_API_ATTRS BTy FPowI(BTy base, ETy exp) {
 }
 
 extern "C" {
+RT_EXT_API_GROUP_BEGIN
 
 CppTypeFor<TypeCategory::Integer, 1> RTDEF(Ceiling4_1)(
     CppTypeFor<TypeCategory::Real, 4> x) {
@@ -967,5 +968,7 @@ CppTypeFor<TypeCategory::Real, 16> RTDEF(FPow16k)(
   return FPowI(b, e);
 }
 #endif
+
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/pointer.cpp b/flang/runtime/pointer.cpp
index b0003add7b3585..f83c00089813eb 100644
--- a/flang/runtime/pointer.cpp
+++ b/flang/runtime/pointer.cpp
@@ -16,8 +16,9 @@
 
 namespace Fortran::runtime {
 extern "C" {
+RT_EXT_API_GROUP_BEGIN
 
-void RTNAME(PointerNullifyIntrinsic)(Descriptor &pointer, TypeCategory category,
+void RTDEF(PointerNullifyIntrinsic)(Descriptor &pointer, TypeCategory category,
     int kind, int rank, int corank) {
   INTERNAL_CHECK(corank == 0);
   pointer.Establish(TypeCode{category, kind},
@@ -25,20 +26,20 @@ void RTNAME(PointerNullifyIntrinsic)(Descriptor &pointer, TypeCategory category,
       CFI_attribute_pointer);
 }
 
-void RTNAME(PointerNullifyCharacter)(Descriptor &pointer, SubscriptValue length,
+void RTDEF(PointerNullifyCharacter)(Descriptor &pointer, SubscriptValue length,
     int kind, int rank, int corank) {
   INTERNAL_CHECK(corank == 0);
   pointer.Establish(
       kind, length, nullptr, rank, nullptr, CFI_attribute_pointer);
 }
 
-void RTNAME(PointerNullifyDerived)(Descriptor &pointer,
+void RTDEF(PointerNullifyDerived)(Descriptor &pointer,
     const typeInfo::DerivedType &derivedType, int rank, int corank) {
   INTERNAL_CHECK(corank == 0);
   pointer.Establish(derivedType, nullptr, rank, nullptr, CFI_attribute_pointer);
 }
 
-void RTNAME(PointerSetBounds)(Descriptor &pointer, int zeroBasedDim,
+void RTDEF(PointerSetBounds)(Descriptor &pointer, int zeroBasedDim,
     SubscriptValue lower, SubscriptValue upper) {
   INTERNAL_CHECK(zeroBasedDim >= 0 && zeroBasedDim < pointer.rank());
   pointer.GetDimension(zeroBasedDim).SetBounds(lower, upper);
@@ -47,28 +48,28 @@ void RTNAME(PointerSetBounds)(Descriptor &pointer, int zeroBasedDim,
 
 // TODO: PointerSetCoBounds
 
-void RTNAME(PointerSetDerivedLength)(
+void RTDEF(PointerSetDerivedLength)(
     Descriptor &pointer, int which, SubscriptValue x) {
   DescriptorAddendum *addendum{pointer.Addendum()};
   INTERNAL_CHECK(addendum != nullptr);
   addendum->SetLenParameterValue(which, x);
 }
 
-void RTNAME(PointerApplyMold)(
+void RTDEF(PointerApplyMold)(
     Descriptor &pointer, const Descriptor &mold, int rank) {
   pointer.ApplyMold(mold, rank);
 }
 
-void RTNAME(PointerAssociateScalar)(Descriptor &pointer, void *target) {
+void RTDEF(PointerAssociateScalar)(Descriptor &pointer, void *target) {
   pointer.set_base_addr(target);
 }
 
-void RTNAME(PointerAssociate)(Descriptor &pointer, const Descriptor &target) {
+void RTDEF(PointerAssociate)(Descriptor &pointer, const Descriptor &target) {
   pointer = target;
   pointer.raw().attribute = CFI_attribute_pointer;
 }
 
-void RTNAME(PointerAssociateLowerBounds)(Descriptor &pointer,
+void RTDEF(PointerAssociateLowerBounds)(Descriptor &pointer,
     const Descriptor &target, const Descriptor &lowerBounds) {
   pointer = target;
   pointer.raw().attribute = CFI_attribute_pointer;
@@ -84,7 +85,7 @@ void RTNAME(PointerAssociateLowerBounds)(Descriptor &pointer,
   }
 }
 
-void RTNAME(PointerAssociateRemapping)(Descriptor &pointer,
+void RTDEF(PointerAssociateRemapping)(Descriptor &pointer,
     const Descriptor &target, const Descriptor &bounds, const char *sourceFile,
     int sourceLine) {
   pointer = target;
@@ -122,7 +123,7 @@ void RTNAME(PointerAssociateRemapping)(Descriptor &pointer,
   }
 }
 
-int RTNAME(PointerAllocate)(Descriptor &pointer, bool hasStat,
+int RTDEF(PointerAllocate)(Descriptor &pointer, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   if (!pointer.IsPointer()) {
@@ -141,7 +142,7 @@ int RTNAME(PointerAllocate)(Descriptor &pointer, bool hasStat,
   return stat;
 }
 
-int RTNAME(PointerAllocateSource)(Descriptor &pointer, const Descriptor &source,
+int RTDEF(PointerAllocateSource)(Descriptor &pointer, const Descriptor &source,
     bool hasStat, const Descriptor *errMsg, const char *sourceFile,
     int sourceLine) {
   int stat{RTNAME(PointerAllocate)(
@@ -153,7 +154,7 @@ int RTNAME(PointerAllocateSource)(Descriptor &pointer, const Descriptor &source,
   return stat;
 }
 
-int RTNAME(PointerDeallocate)(Descriptor &pointer, bool hasStat,
+int RTDEF(PointerDeallocate)(Descriptor &pointer, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   if (!pointer.IsPointer()) {
@@ -167,7 +168,7 @@ int RTNAME(PointerDeallocate)(Descriptor &pointer, bool hasStat,
       errMsg, hasStat);
 }
 
-int RTNAME(PointerDeallocatePolymorphic)(Descriptor &pointer,
+int RTDEF(PointerDeallocatePolymorphic)(Descriptor &pointer,
     const typeInfo::DerivedType *derivedType, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(PointerDeallocate)(
@@ -187,11 +188,11 @@ int RTNAME(PointerDeallocatePolymorphic)(Descriptor &pointer,
   return stat;
 }
 
-bool RTNAME(PointerIsAssociated)(const Descriptor &pointer) {
+bool RTDEF(PointerIsAssociated)(const Descriptor &pointer) {
   return pointer.raw().base_addr != nullptr;
 }
 
-bool RTNAME(PointerIsAssociatedWith)(
+bool RTDEF(PointerIsAssociatedWith)(
     const Descriptor &pointer, const Descriptor *target) {
   if (!target) {
     return pointer.raw().base_addr != nullptr;
@@ -220,5 +221,6 @@ bool RTNAME(PointerIsAssociatedWith)(
 
 // TODO: PointerCheckLengthParameter
 
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/product.cpp b/flang/runtime/product.cpp
index 683cb61fe9951a..a516bc51a959b7 100644
--- a/flang/runtime/product.cpp
+++ b/flang/runtime/product.cpp
@@ -18,13 +18,15 @@
 namespace Fortran::runtime {
 template <typename INTERMEDIATE> class NonComplexProductAccumulator {
 public:
-  explicit NonComplexProductAccumulator(const Descriptor &array)
+  explicit RT_API_ATTRS NonComplexProductAccumulator(const Descriptor &array)
       : array_{array} {}
-  void Reinitialize() { product_ = 1; }
-  template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+  RT_API_ATTRS void Reinitialize() { product_ = 1; }
+  template <typename A>
+  RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
     *p = static_cast<A>(product_);
   }
-  template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+  template <typename A>
+  RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
     product_ *= *array_.Element<A>(at);
     return product_ != 0;
   }
@@ -34,16 +36,24 @@ template <typename INTERMEDIATE> class NonComplexProductAccumulator {
   INTERMEDIATE product_{1};
 };
 
+// Suppress the warnings about calling __host__-only std::complex operators,
+// defined in C++ STD header files, from __device__ code.
+RT_DIAG_PUSH
+RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+
 template <typename PART> class ComplexProductAccumulator {
 public:
-  explicit ComplexProductAccumulator(const Descriptor &array) : array_{array} {}
-  void Reinitialize() { product_ = std::complex<PART>{1, 0}; }
-  template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+  explicit RT_API_ATTRS ComplexProductAccumulator(const Descriptor &array)
+      : array_{array} {}
+  RT_API_ATTRS void Reinitialize() { product_ = std::complex<PART>{1, 0}; }
+  template <typename A>
+  RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
     using ResultPart = typename A::value_type;
     *p = {static_cast<ResultPart>(product_.real()),
         static_cast<ResultPart>(product_.imag())};
   }
-  template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+  template <typename A>
+  RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
     product_ *= *array_.Element<A>(at);
     return true;
   }
@@ -53,37 +63,37 @@ template <typename PART> class ComplexProductAccumulator {
   std::complex<PART> product_{1, 0};
 };
 
+RT_DIAG_POP
+
 extern "C" {
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(ProductInteger1)(
-    const Descriptor &x, const char *source, int line, int dim,
-    const Descriptor *mask) {
+RT_EXT_API_GROUP_BEGIN
+
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(ProductInteger1)(const Descriptor &x,
+    const char *source, int line, int dim, const Descriptor *mask) {
   return GetTotalReduction<TypeCategory::Integer, 1>(x, source, line, dim, mask,
       NonComplexProductAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x},
       "PRODUCT");
 }
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(ProductInteger2)(
-    const Descriptor &x, const char *source, int line, int dim,
-    const Descriptor *mask) {
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(ProductInteger2)(const Descriptor &x,
+    const char *source, int line, int dim, const Descriptor *mask) {
   return GetTotalReduction<TypeCategory::Integer, 2>(x, source, line, dim, mask,
       NonComplexProductAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x},
       "PRODUCT");
 }
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(ProductInteger4)(
-    const Descriptor &x, const char *source, int line, int dim,
-    const Descriptor *mask) {
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(ProductInteger4)(const Descriptor &x,
+    const char *source, int line, int dim, const Descriptor *mask) {
   return GetTotalReduction<TypeCategory::Integer, 4>(x, source, line, dim, mask,
       NonComplexProductAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x},
       "PRODUCT");
 }
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(ProductInteger8)(
-    const Descriptor &x, const char *source, int line, int dim,
-    const Descriptor *mask) {
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(ProductInteger8)(const Descriptor &x,
+    const char *source, int line, int dim, const Descriptor *mask) {
   return GetTotalReduction<TypeCategory::Integer, 8>(x, source, line, dim, mask,
       NonComplexProductAccumulator<CppTypeFor<TypeCategory::Integer, 8>>{x},
       "PRODUCT");
 }
 #ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(ProductInteger16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(ProductInteger16)(
     const Descriptor &x, const char *source, int line, int dim,
     const Descriptor *mask) {
   return GetTotalReduction<TypeCategory::Integer, 16>(x, source, line, dim,
@@ -94,27 +104,27 @@ CppTypeFor<TypeCategory::Integer, 16> RTNAME(ProductInteger16)(
 #endif
 
 // TODO: real/complex(2 & 3)
-CppTypeFor<TypeCategory::Real, 4> RTNAME(ProductReal4)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 4> RTDEF(ProductReal4)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return GetTotalReduction<TypeCategory::Real, 4>(x, source, line, dim, mask,
       NonComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 8>>{x},
       "PRODUCT");
 }
-CppTypeFor<TypeCategory::Real, 8> RTNAME(ProductReal8)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 8> RTDEF(ProductReal8)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return GetTotalReduction<TypeCategory::Real, 8>(x, source, line, dim, mask,
       NonComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 8>>{x},
       "PRODUCT");
 }
 #if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(ProductReal10)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 10> RTDEF(ProductReal10)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return GetTotalReduction<TypeCategory::Real, 10>(x, source, line, dim, mask,
       NonComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 10>>{x},
       "PRODUCT");
 }
 #elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTNAME(ProductReal16)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 16> RTDEF(ProductReal16)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return GetTotalReduction<TypeCategory::Real, 16>(x, source, line, dim, mask,
       NonComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 16>>{x},
@@ -122,14 +132,14 @@ CppTypeFor<TypeCategory::Real, 16> RTNAME(ProductReal16)(const Descriptor &x,
 }
 #endif
 
-void RTNAME(CppProductComplex4)(CppTypeFor<TypeCategory::Complex, 4> &result,
+void RTDEF(CppProductComplex4)(CppTypeFor<TypeCategory::Complex, 4> &result,
     const Descriptor &x, const char *source, int line, int dim,
     const Descriptor *mask) {
   result = GetTotalReduction<TypeCategory::Complex, 4>(x, source, line, dim,
       mask, ComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 8>>{x},
       "PRODUCT");
 }
-void RTNAME(CppProductComplex8)(CppTypeFor<TypeCategory::Complex, 8> &result,
+void RTDEF(CppProductComplex8)(CppTypeFor<TypeCategory::Complex, 8> &result,
     const Descriptor &x, const char *source, int line, int dim,
     const Descriptor *mask) {
   result = GetTotalReduction<TypeCategory::Complex, 8>(x, source, line, dim,
@@ -137,7 +147,7 @@ void RTNAME(CppProductComplex8)(CppTypeFor<TypeCategory::Complex, 8> &result,
       "PRODUCT");
 }
 #if LDBL_MANT_DIG == 64
-void RTNAME(CppProductComplex10)(CppTypeFor<TypeCategory::Complex, 10> &result,
+void RTDEF(CppProductComplex10)(CppTypeFor<TypeCategory::Complex, 10> &result,
     const Descriptor &x, const char *source, int line, int dim,
     const Descriptor *mask) {
   result = GetTotalReduction<TypeCategory::Complex, 10>(x, source, line, dim,
@@ -145,7 +155,7 @@ void RTNAME(CppProductComplex10)(CppTypeFor<TypeCategory::Complex, 10> &result,
       "PRODUCT");
 }
 #elif LDBL_MANT_DIG == 113
-void RTNAME(CppProductComplex16)(CppTypeFor<TypeCategory::Complex, 16> &result,
+void RTDEF(CppProductComplex16)(CppTypeFor<TypeCategory::Complex, 16> &result,
     const Descriptor &x, const char *source, int line, int dim,
     const Descriptor *mask) {
   result = GetTotalReduction<TypeCategory::Complex, 16>(x, source, line, dim,
@@ -154,11 +164,13 @@ void RTNAME(CppProductComplex16)(CppTypeFor<TypeCategory::Complex, 16> &result,
 }
 #endif
 
-void RTNAME(ProductDim)(Descriptor &result, const Descriptor &x, int dim,
+void RTDEF(ProductDim)(Descriptor &result, const Descriptor &x, int dim,
     const char *source, int line, const Descriptor *mask) {
   TypedPartialNumericReduction<NonComplexProductAccumulator,
       NonComplexProductAccumulator, ComplexProductAccumulator>(
       result, x, dim, source, line, mask, "PRODUCT");
 }
+
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/ragged.cpp b/flang/runtime/ragged.cpp
index 855aa02e7f5963..68ff59d9eda47c 100644
--- a/flang/runtime/ragged.cpp
+++ b/flang/runtime/ragged.cpp
@@ -7,20 +7,22 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/ragged.h"
+#include "tools.h"
 #include <cstdlib>
 
 namespace Fortran::runtime {
 
-inline bool isIndirection(const RaggedArrayHeader *const header) {
+inline RT_API_ATTRS bool isIndirection(const RaggedArrayHeader *const header) {
   return header->flags & 1;
 }
 
-inline std::size_t rank(const RaggedArrayHeader *const header) {
+inline RT_API_ATTRS std::size_t rank(const RaggedArrayHeader *const header) {
   return header->flags >> 1;
 }
 
-RaggedArrayHeader *RaggedArrayAllocate(RaggedArrayHeader *header, bool isHeader,
-    std::int64_t rank, std::int64_t elementSize, std::int64_t *extentVector) {
+RT_API_ATTRS RaggedArrayHeader *RaggedArrayAllocate(RaggedArrayHeader *header,
+    bool isHeader, std::int64_t rank, std::int64_t elementSize,
+    std::int64_t *extentVector) {
   if (header && rank) {
     std::int64_t size{1};
     for (std::int64_t counter{0}; counter < rank; ++counter) {
@@ -32,10 +34,11 @@ RaggedArrayHeader *RaggedArrayAllocate(RaggedArrayHeader *header, bool isHeader,
     header->flags = (rank << 1) | isHeader;
     header->extentPointer = extentVector;
     if (isHeader) {
-      header->bufferPointer = std::calloc(sizeof(RaggedArrayHeader), size);
+      header->bufferPointer =
+          Fortran::runtime::calloc(sizeof(RaggedArrayHeader), size);
     } else {
       header->bufferPointer =
-          static_cast<void *>(std::calloc(elementSize, size));
+          static_cast<void *>(Fortran::runtime::calloc(elementSize, size));
     }
     return header;
   } else {
@@ -44,7 +47,7 @@ RaggedArrayHeader *RaggedArrayAllocate(RaggedArrayHeader *header, bool isHeader,
 }
 
 // Deallocate a ragged array from the heap.
-void RaggedArrayDeallocate(RaggedArrayHeader *raggedArrayHeader) {
+RT_API_ATTRS void RaggedArrayDeallocate(RaggedArrayHeader *raggedArrayHeader) {
   if (raggedArrayHeader) {
     if (std::size_t end{rank(raggedArrayHeader)}) {
       if (isIndirection(raggedArrayHeader)) {
@@ -66,14 +69,14 @@ void RaggedArrayDeallocate(RaggedArrayHeader *raggedArrayHeader) {
 }
 
 extern "C" {
-void *RTNAME(RaggedArrayAllocate)(void *header, bool isHeader,
-    std::int64_t rank, std::int64_t elementSize, std::int64_t *extentVector) {
+void *RTDEF(RaggedArrayAllocate)(void *header, bool isHeader, std::int64_t rank,
+    std::int64_t elementSize, std::int64_t *extentVector) {
   auto *result = RaggedArrayAllocate(static_cast<RaggedArrayHeader *>(header),
       isHeader, rank, elementSize, extentVector);
   return static_cast<void *>(result);
 }
 
-void RTNAME(RaggedArrayDeallocate)(void *raggedArrayHeader) {
+void RTDEF(RaggedArrayDeallocate)(void *raggedArrayHeader) {
   RaggedArrayDeallocate(static_cast<RaggedArrayHeader *>(raggedArrayHeader));
 }
 } // extern "C"
diff --git a/flang/runtime/reduction.cpp b/flang/runtime/reduction.cpp
index 63e309ef86ac45..074a270cb50838 100644
--- a/flang/runtime/reduction.cpp
+++ b/flang/runtime/reduction.cpp
@@ -336,6 +336,7 @@ template <int KIND> struct CountDimension {
 };
 
 extern "C" {
+RT_EXT_API_GROUP_BEGIN
 
 bool RTDEF(All)(const Descriptor &x, const char *source, int line, int dim) {
   return GetTotalLogicalReduction(x, source, line, dim,
@@ -383,5 +384,6 @@ void RTDEF(ParityDim)(Descriptor &result, const Descriptor &x, int dim,
       result, x, dim, terminator, "PARITY");
 }
 
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/sum.cpp b/flang/runtime/sum.cpp
index 5d025a4b33d6f7..048399737c8501 100644
--- a/flang/runtime/sum.cpp
+++ b/flang/runtime/sum.cpp
@@ -99,6 +99,8 @@ template <typename PART> class ComplexSumAccumulator {
 };
 
 extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
 CppTypeFor<TypeCategory::Integer, 1> RTDEF(SumInteger1)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return GetTotalReduction<TypeCategory::Integer, 1>(x, source, line, dim, mask,
@@ -187,5 +189,7 @@ void RTDEF(SumDim)(Descriptor &result, const Descriptor &x, int dim,
   TypedPartialNumericReduction<IntegerSumAccumulator, RealSumAccumulator,
       ComplexSumAccumulator>(result, x, dim, source, line, mask, "SUM");
 }
+
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/support.cpp b/flang/runtime/support.cpp
index 08aa8014f4f733..12135804f00e69 100644
--- a/flang/runtime/support.cpp
+++ b/flang/runtime/support.cpp
@@ -11,10 +11,12 @@
 
 namespace Fortran::runtime {
 extern "C" {
+RT_EXT_API_GROUP_BEGIN
 
 bool RTDEF(IsContiguous)(const Descriptor &descriptor) {
   return descriptor.IsContiguous();
 }
 
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::runtime

>From 7b8f790566d3171a23b87cda3f78b3e53684785a Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Thu, 28 Dec 2023 11:07:57 -0800
Subject: [PATCH 2/3] Addressed review comments.

---
 flang/include/flang/Runtime/memory.h |  7 +++++--
 flang/runtime/array-constructor.cpp  |  4 ++--
 flang/runtime/dot-product.cpp        |  3 +++
 flang/runtime/freestanding-tools.h   | 21 +--------------------
 flang/runtime/memory.cpp             | 14 ++++++++++++++
 flang/runtime/ragged.cpp             | 12 +++++++-----
 6 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/flang/include/flang/Runtime/memory.h b/flang/include/flang/Runtime/memory.h
index bde056f439a5ee..e24c509f4e90cb 100644
--- a/flang/include/flang/Runtime/memory.h
+++ b/flang/include/flang/Runtime/memory.h
@@ -23,14 +23,17 @@ class Terminator;
 
 [[nodiscard]] RT_API_ATTRS void *AllocateMemoryOrCrash(
     const Terminator &, std::size_t bytes);
-template <typename A> [[nodiscard]] A &AllocateOrCrash(const Terminator &t) {
+template <typename A>
+[[nodiscard]] RT_API_ATTRS A &AllocateOrCrash(const Terminator &t) {
   return *reinterpret_cast<A *>(AllocateMemoryOrCrash(t, sizeof(A)));
 }
+RT_API_ATTRS void *ReallocateMemoryOrCrash(
+    const Terminator &, void *ptr, std::size_t newByteSize);
 RT_API_ATTRS void FreeMemory(void *);
 template <typename A> RT_API_ATTRS void FreeMemory(A *p) {
   FreeMemory(reinterpret_cast<void *>(p));
 }
-template <typename A> void FreeMemoryAndNullify(A *&p) {
+template <typename A> RT_API_ATTRS void FreeMemoryAndNullify(A *&p) {
   FreeMemory(p);
   p = nullptr;
 }
diff --git a/flang/runtime/array-constructor.cpp b/flang/runtime/array-constructor.cpp
index 3905c70460987f..72e08feff7fd10 100644
--- a/flang/runtime/array-constructor.cpp
+++ b/flang/runtime/array-constructor.cpp
@@ -74,8 +74,8 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
       // realloc is undefined with zero new size and ElementBytes() may be null
       // if the character length is null, or if "from" is a zero sized array.
       if (newByteSize > 0) {
-        void *p{Fortran::runtime::realloc(to.raw().base_addr, newByteSize)};
-        RUNTIME_CHECK(terminator, p);
+        void *p{ReallocateMemoryOrCrash(
+            terminator, to.raw().base_addr, newByteSize)};
         to.set_base_addr(p);
       }
       vector.actualAllocationSize = requestedAllocationSize;
diff --git a/flang/runtime/dot-product.cpp b/flang/runtime/dot-product.cpp
index 5e6dd541f6b1a1..977698269bcb46 100644
--- a/flang/runtime/dot-product.cpp
+++ b/flang/runtime/dot-product.cpp
@@ -91,6 +91,9 @@ static inline RT_API_ATTRS CppTypeFor<RCAT, RKIND> DoDotProduct(
         for (SubscriptValue j{0}; j < n; ++j) {
           // std::conj() may instantiate its argument twice,
           // so xp has to be incremented separately.
+          // This is a workaround for an alleged bug in clang,
+          // that shows up as:
+          //   warning: multiple unsequenced modifications to 'xp'
           accum += std::conj(static_cast<AccumType>(*xp)) *
               static_cast<AccumType>(*yp++);
           xp++;
diff --git a/flang/runtime/freestanding-tools.h b/flang/runtime/freestanding-tools.h
index 757c6d07e5711c..bdc11ae93ac909 100644
--- a/flang/runtime/freestanding-tools.h
+++ b/flang/runtime/freestanding-tools.h
@@ -42,11 +42,6 @@
 #define STD_REALLOC_UNSUPPORTED 1
 #endif
 
-#if !defined(STD_CALLOC_UNSUPPORTED) && \
-    (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
-#define STD_CALLOC_UNSUPPORTED 1
-#endif
-
 namespace Fortran::runtime {
 
 #if STD_FILL_N_UNSUPPORTED
@@ -129,7 +124,7 @@ using std::memcmp;
 #endif // !STD_MEMCMP_UNSUPPORTED
 
 #if STD_REALLOC_UNSUPPORTED
-static inline RT_API_ATTRS void *realloc(void *ptr, std::size_t new_size) {
+static inline RT_API_ATTRS void *realloc(void *ptr, std::size_t newByteSize) {
   // Return nullptr and let the callers assert that.
   // TODO: we can provide a straightforward implementation
   // via malloc/memcpy/free.
@@ -139,19 +134,5 @@ static inline RT_API_ATTRS void *realloc(void *ptr, std::size_t new_size) {
 using std::realloc;
 #endif // !STD_REALLOC_UNSUPPORTED
 
-#if STD_CALLOC_UNSUPPORTED
-static inline RT_API_ATTRS void *calloc(std::size_t num, std::size_t size) {
-  std::size_t bytes{num * size};
-  void *ptr{nullptr};
-  if (bytes != 0) {
-    ptr = std::malloc(bytes);
-    ptr = std::memset(ptr, 0, bytes);
-  }
-  return ptr;
-}
-#else // !STD_CALLOC_UNSUPPORTED
-using std::calloc;
-#endif // !STD_CALLOC_UNSUPPORTED
-
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_FREESTANDING_TOOLS_H_
diff --git a/flang/runtime/memory.cpp b/flang/runtime/memory.cpp
index 3f9b706f3dff67..aa6ff9723d1a80 100644
--- a/flang/runtime/memory.cpp
+++ b/flang/runtime/memory.cpp
@@ -8,6 +8,7 @@
 
 #include "flang/Runtime/memory.h"
 #include "terminator.h"
+#include "tools.h"
 #include <cstdlib>
 
 namespace Fortran::runtime {
@@ -26,6 +27,19 @@ RT_API_ATTRS void *AllocateMemoryOrCrash(
   return nullptr;
 }
 
+RT_API_ATTRS void *ReallocateMemoryOrCrash(
+    const Terminator &terminator, void *ptr, std::size_t newByteSize) {
+  if (void *p{Fortran::runtime::realloc(ptr, newByteSize)}) {
+    return p;
+  }
+  if (newByteSize > 0) {
+    terminator.Crash("Fortran runtime internal error: memory realloc returned "
+                     "null, needed %zd bytes",
+        newByteSize);
+  }
+  return nullptr;
+}
+
 RT_API_ATTRS void FreeMemory(void *p) { std::free(p); }
 
 RT_OFFLOAD_VAR_GROUP_END
diff --git a/flang/runtime/ragged.cpp b/flang/runtime/ragged.cpp
index 68ff59d9eda47c..a4d9e541ba5311 100644
--- a/flang/runtime/ragged.cpp
+++ b/flang/runtime/ragged.cpp
@@ -34,11 +34,13 @@ RT_API_ATTRS RaggedArrayHeader *RaggedArrayAllocate(RaggedArrayHeader *header,
     header->flags = (rank << 1) | isHeader;
     header->extentPointer = extentVector;
     if (isHeader) {
-      header->bufferPointer =
-          Fortran::runtime::calloc(sizeof(RaggedArrayHeader), size);
-    } else {
-      header->bufferPointer =
-          static_cast<void *>(Fortran::runtime::calloc(elementSize, size));
+      elementSize = sizeof(RaggedArrayHeader);
+    }
+    Terminator terminator{__FILE__, __LINE__};
+    std::size_t bytes{static_cast<std::size_t>(elementSize * size)};
+    header->bufferPointer = AllocateMemoryOrCrash(terminator, bytes);
+    if (header->bufferPointer) {
+      std::memset(header->bufferPointer, 0, bytes);
     }
     return header;
   } else {

>From b40e18a589b5b67cb564251327851e7746cffe0a Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Thu, 28 Dec 2023 13:34:45 -0800
Subject: [PATCH 3/3] Fix for Windows build.

---
 flang/runtime/tools.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flang/runtime/tools.h b/flang/runtime/tools.h
index ff05e76c8bb7b2..d69079e43701d6 100644
--- a/flang/runtime/tools.h
+++ b/flang/runtime/tools.h
@@ -108,8 +108,8 @@ static inline RT_API_ATTRS std::optional<std::int64_t> GetInt64Safe(
   case 16: {
     using Int128 = CppTypeFor<TypeCategory::Integer, 16>;
     auto n{*reinterpret_cast<const Int128 *>(p)};
-    std::int64_t result = n;
-    if (result == n) {
+    std::int64_t result{static_cast<std::int64_t>(n)};
+    if (static_cast<Int128>(result) == n) {
       return result;
     }
     return std::nullopt;



More information about the flang-commits mailing list