[flang-commits] [flang] [flang][runtime] Support for offload build of FortranDecimal. (PR #87653)

Fri Apr 5 08:30:07 PDT 2024

https://github.com/vzakhari updated https://github.com/llvm/llvm-project/pull/87653

>From be018cca7369f021a167fb61d1fc62b2046f2c54 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Wed, 3 Apr 2024 18:11:58 -0700
Subject: [PATCH 1/3] [NFC][flang][runtime] Move CMake code for the offload
 builds into utility.

---
 .../modules/AddFlangOffloadRuntime.cmake      | 132 ++++++++++++++++++
 flang/runtime/CMakeLists.txt                  | 129 +----------------
 2 files changed, 135 insertions(+), 126 deletions(-)
 create mode 100644 flang/cmake/modules/AddFlangOffloadRuntime.cmake

diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake
new file mode 100644
index 00000000000000..6fb6213e90fc49
--- /dev/null
+++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake
@@ -0,0 +1,132 @@
+option(FLANG_EXPERIMENTAL_CUDA_RUNTIME
+  "Compile Fortran runtime as CUDA sources (experimental)" OFF
+  )
+
+set(FLANG_LIBCUDACXX_PATH "" CACHE PATH "Path to libcu++ package installation")
+
+set(FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD "off" CACHE STRING
+  "Compile Fortran runtime as OpenMP target offload sources (experimental). Valid options are 'off', 'host_device', 'nohost'")
+
+set(FLANG_OMP_DEVICE_ARCHITECTURES "all" CACHE STRING
+  "List of OpenMP device architectures to be used to compile the Fortran runtime (e.g. 'gfx1103;sm_90')")
+
+macro(enable_cuda_compilation files)
+  if (FLANG_EXPERIMENTAL_CUDA_RUNTIME)
+    if (BUILD_SHARED_LIBS)
+      message(FATAL_ERROR
+        "BUILD_SHARED_LIBS is not supported for CUDA build of Fortran runtime"
+        )
+    endif()
+
+    enable_language(CUDA)
+
+    # TODO: figure out how to make target property CUDA_SEPARABLE_COMPILATION
+    # work, and avoid setting CMAKE_CUDA_SEPARABLE_COMPILATION.
+    set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+
+    # Treat all supported sources as CUDA files.
+    set_source_files_properties(${files} PROPERTIES LANGUAGE CUDA)
+    set(CUDA_COMPILE_OPTIONS)
+    if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "Clang")
+      # Allow varargs.
+      set(CUDA_COMPILE_OPTIONS
+        -Xclang -fcuda-allow-variadic-functions
+        )
+    endif()
+    if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "NVIDIA")
+      set(CUDA_COMPILE_OPTIONS
+        --expt-relaxed-constexpr
+        # Disable these warnings:
+        #   'long double' is treated as 'double' in device code
+        -Xcudafe --diag_suppress=20208
+        -Xcudafe --display_error_number
+        )
+    endif()
+    set_source_files_properties(${files} PROPERTIES COMPILE_OPTIONS
+      "${CUDA_COMPILE_OPTIONS}"
+      )
+
+    if (EXISTS "${FLANG_LIBCUDACXX_PATH}/include")
+      # When using libcudacxx header files, we have to use them
+      # for all files of F18 runtime.
+      include_directories(AFTER ${FLANG_LIBCUDACXX_PATH}/include)
+      add_compile_definitions(RT_USE_LIBCUDACXX=1)
+    endif()
+  endif()
+endmacro()
+
+macro(enable_omp_offload_compilation files)
+  if (NOT FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD STREQUAL "off")
+    # 'host_device' build only works with Clang compiler currently.
+    # The build is done with the CMAKE_C/CXX_COMPILER, i.e. it does not use
+    # the in-tree built Clang. We may have a mode that would use the in-tree
+    # built Clang.
+    #
+    # 'nohost' is supposed to produce an LLVM Bitcode library,
+    # and it has to be done with a C/C++ compiler producing LLVM Bitcode
+    # compatible with the LLVM toolchain version distributed with the Flang
+    # compiler.
+    # In general, the in-tree built Clang should be used for 'nohost' build.
+    # Note that 'nohost' build does not produce the host version of Flang
+    # runtime library, so there will be two separate distributable objects.
+    # 'nohost' build is a TODO.
+
+    if (NOT FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD STREQUAL "host_device")
+      message(FATAL_ERROR "Unsupported OpenMP offload build of Flang runtime")
+    endif()
+    if (BUILD_SHARED_LIBS)
+      message(FATAL_ERROR
+        "BUILD_SHARED_LIBS is not supported for OpenMP offload build of Fortran runtime"
+        )
+    endif()
+
+    if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" AND
+        "${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
+
+      set(all_amdgpu_architectures
+        "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
+        "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030"
+        "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036"
+        "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151"
+        )
+      set(all_nvptx_architectures
+        "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
+        "sm_70;sm_72;sm_75;sm_80;sm_86;sm_89;sm_90"
+        )
+      set(all_gpu_architectures
+        "${all_amdgpu_architectures};${all_nvptx_architectures}"
+        )
+      # TODO: support auto detection on the build system.
+      if (FLANG_OMP_DEVICE_ARCHITECTURES STREQUAL "all")
+        set(FLANG_OMP_DEVICE_ARCHITECTURES ${all_gpu_architectures})
+      endif()
+      list(REMOVE_DUPLICATES FLANG_OMP_DEVICE_ARCHITECTURES)
+
+      string(REPLACE ";" "," compile_for_architectures
+        "${FLANG_OMP_DEVICE_ARCHITECTURES}"
+        )
+
+      set(OMP_COMPILE_OPTIONS
+        -fopenmp
+        -fvisibility=hidden
+        -fopenmp-cuda-mode
+        --offload-arch=${compile_for_architectures}
+        # Force LTO for the device part.
+        -foffload-lto
+        )
+      set_source_files_properties(${files} PROPERTIES COMPILE_OPTIONS
+        "${OMP_COMPILE_OPTIONS}"
+        )
+
+      # Enable "declare target" in the source code.
+      set_source_files_properties(${files}
+        PROPERTIES COMPILE_DEFINITIONS OMP_OFFLOAD_BUILD
+        )
+    else()
+      message(FATAL_ERROR
+        "Flang runtime build is not supported for these compilers:\n"
+        "CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}\n"
+        "CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}")
+    endif()
+  endif()
+endmacro()
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index c0e4cff698e3cb..2a65a22ab674c4 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -171,10 +171,7 @@ set(sources
   utf.cpp
 )
 
-option(FLANG_EXPERIMENTAL_CUDA_RUNTIME
-  "Compile Fortran runtime as CUDA sources (experimental)" OFF
-  )
-set(FLANG_LIBCUDACXX_PATH "" CACHE PATH "Path to libcu++ package installation")
+include(AddFlangOffloadRuntime)
 
 # List of files that are buildable for all devices.
 set(supported_files
@@ -227,128 +224,8 @@ set(supported_files
   utf.cpp
   )
 
-if (FLANG_EXPERIMENTAL_CUDA_RUNTIME)
-  if (BUILD_SHARED_LIBS)
-    message(FATAL_ERROR
-      "BUILD_SHARED_LIBS is not supported for CUDA build of Fortran runtime"
-      )
-  endif()
-
-  enable_language(CUDA)
-
-  # TODO: figure out how to make target property CUDA_SEPARABLE_COMPILATION
-  # work, and avoid setting CMAKE_CUDA_SEPARABLE_COMPILATION.
-  set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
-
-  # Treat all supported sources as CUDA files.
-  set_source_files_properties(${supported_files} PROPERTIES LANGUAGE CUDA)
-  set(CUDA_COMPILE_OPTIONS)
-  if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "Clang")
-    # Allow varargs.
-    set(CUDA_COMPILE_OPTIONS
-      -Xclang -fcuda-allow-variadic-functions
-      )
-  endif()
-  if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "NVIDIA")
-    set(CUDA_COMPILE_OPTIONS
-      --expt-relaxed-constexpr
-      # Disable these warnings:
-      #   'long double' is treated as 'double' in device code
-      -Xcudafe --diag_suppress=20208
-      -Xcudafe --display_error_number
-      )
-  endif()
-  set_source_files_properties(${supported_files} PROPERTIES COMPILE_OPTIONS
-    "${CUDA_COMPILE_OPTIONS}"
-    )
-
-  if (EXISTS "${FLANG_LIBCUDACXX_PATH}/include")
-    # When using libcudacxx headers files, we have to use them
-    # for all files of F18 runtime.
-    include_directories(AFTER ${FLANG_LIBCUDACXX_PATH}/include)
-    add_compile_definitions(RT_USE_LIBCUDACXX=1)
-  endif()
-endif()
-
-set(FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD "off" CACHE STRING
-  "Compile Fortran runtime as OpenMP target offload sources (experimental). Valid options are 'off', 'host_device', 'nohost'")
-
-set(FLANG_OMP_DEVICE_ARCHITECTURES "all" CACHE STRING
-  "List of OpenMP device architectures to be used to compile the Fortran runtime (e.g. 'gfx1103;sm_90')")
-
-if (NOT FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD STREQUAL "off")
-  # 'host_device' build only works with Clang compiler currently.
-  # The build is done with the CMAKE_C/CXX_COMPILER, i.e. it does not use
-  # the in-tree built Clang. We may have a mode that would use the in-tree
-  # built Clang.
-  #
-  # 'nohost' is supposed to produce an LLVM Bitcode library,
-  # and it has to be done with a C/C++ compiler producing LLVM Bitcode
-  # compatible with the LLVM toolchain version distributed with the Flang
-  # compiler.
-  # In general, the in-tree built Clang should be used for 'nohost' build.
-  # Note that 'nohost' build does not produce the host version of Flang
-  # runtime library, so there will be two separate distributable objects.
-  # 'nohost' build is a TODO.
-
-  if (NOT FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD STREQUAL "host_device")
-    message(FATAL_ERROR "Unsupported OpenMP offload build of Flang runtime")
-  endif()
-  if (BUILD_SHARED_LIBS)
-    message(FATAL_ERROR
-      "BUILD_SHARED_LIBS is not supported for OpenMP offload build of Fortran runtime"
-      )
-  endif()
-
-  if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" AND
-      "${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
-
-    set(all_amdgpu_architectures
-      "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
-      "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030"
-      "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036"
-      "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151"
-      )
-    set(all_nvptx_architectures
-      "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
-      "sm_70;sm_72;sm_75;sm_80;sm_86;sm_89;sm_90"
-      )
-    set(all_gpu_architectures
-      "${all_amdgpu_architectures};${all_nvptx_architectures}"
-      )
-    # TODO: support auto detection on the build system.
-    if (FLANG_OMP_DEVICE_ARCHITECTURES STREQUAL "all")
-      set(FLANG_OMP_DEVICE_ARCHITECTURES ${all_gpu_architectures})
-    endif()
-    list(REMOVE_DUPLICATES FLANG_OMP_DEVICE_ARCHITECTURES)
-
-    string(REPLACE ";" "," compile_for_architectures
-      "${FLANG_OMP_DEVICE_ARCHITECTURES}"
-      )
-
-    set(OMP_COMPILE_OPTIONS
-      -fopenmp
-      -fvisibility=hidden
-      -fopenmp-cuda-mode
-      --offload-arch=${compile_for_architectures}
-      # Force LTO for the device part.
-      -foffload-lto
-      )
-    set_source_files_properties(${supported_files} PROPERTIES COMPILE_OPTIONS
-      "${OMP_COMPILE_OPTIONS}"
-      )
-
-    # Enable "declare target" in the source code.
-    set_source_files_properties(${supported_files}
-      PROPERTIES COMPILE_DEFINITIONS OMP_OFFLOAD_BUILD
-      )
-  else()
-    message(FATAL_ERROR
-      "Flang runtime build is not supported for these compilers:\n"
-      "CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}\n"
-      "CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}")
-  endif()
-endif()
+enable_cuda_compilation("${supported_files}")
+enable_omp_offload_compilation("${supported_files}")
 
 if (NOT TARGET FortranFloat128Math)
   # If FortranFloat128Math is not defined, then we are not building

>From a50d8f7d6182d2903cf983382bb8eabb05b8a15d Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Wed, 3 Apr 2024 18:12:19 -0700
Subject: [PATCH 2/3] [flang][runtime] Initial support for offload build of
 FortranDecimal.

---
 flang/lib/Decimal/CMakeLists.txt             | 10 ++++++++--
 flang/lib/Decimal/big-radix-floating-point.h |  4 ++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/flang/lib/Decimal/CMakeLists.txt b/flang/lib/Decimal/CMakeLists.txt
index 2f6caa22e1562b..3d562b8e3ce1e5 100644
--- a/flang/lib/Decimal/CMakeLists.txt
+++ b/flang/lib/Decimal/CMakeLists.txt
@@ -49,11 +49,17 @@ endif()
 # avoid an unwanted dependency on libstdc++.so.
 add_definitions(-U_GLIBCXX_ASSERTIONS)
 
-add_flang_library(FortranDecimal INSTALL_WITH_TOOLCHAIN
+set(sources
   binary-to-decimal.cpp
   decimal-to-binary.cpp
 )
 
+include(AddFlangOffloadRuntime)
+enable_cuda_compilation("${sources}")
+enable_omp_offload_compilation("${sources}")
+
+add_flang_library(FortranDecimal INSTALL_WITH_TOOLCHAIN ${sources})
+
 if (DEFINED MSVC)
   set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
   add_flang_library(FortranDecimal.static INSTALL_WITH_TOOLCHAIN
@@ -77,4 +83,4 @@ if (DEFINED MSVC)
   )
   add_dependencies(FortranDecimal FortranDecimal.static FortranDecimal.dynamic
     FortranDecimal.static_dbg FortranDecimal.dynamic_dbg)
-endif()
\ No newline at end of file
+endif()
diff --git a/flang/lib/Decimal/big-radix-floating-point.h b/flang/lib/Decimal/big-radix-floating-point.h
index 2143d1d9b3f776..761000287a2025 100644
--- a/flang/lib/Decimal/big-radix-floating-point.h
+++ b/flang/lib/Decimal/big-radix-floating-point.h
@@ -30,6 +30,10 @@
 #include <limits>
 #include <type_traits>
 
+// Some environments, viz. glibc 2.17, allow the macro HUGE
+// to leak out of <math.h>.
+#undef HUGE
+
 namespace Fortran::decimal {
 
 static constexpr std::uint64_t TenToThe(int power) {

>From 7334cb01055a5a46bee3c3efb450afce3f03548f Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Wed, 3 Apr 2024 18:12:44 -0700
Subject: [PATCH 3/3] [flang][runtime] Added offload markup for FortranDecimal
 APIs.

---
 flang/lib/Decimal/big-radix-floating-point.h | 65 ++++++++++----------
 flang/lib/Decimal/binary-to-decimal.cpp      |  6 +-
 flang/lib/Decimal/decimal-to-binary.cpp      | 22 ++++---
 3 files changed, 52 insertions(+), 41 deletions(-)

diff --git a/flang/lib/Decimal/big-radix-floating-point.h b/flang/lib/Decimal/big-radix-floating-point.h
index 761000287a2025..6ce8ae7925c150 100644
--- a/flang/lib/Decimal/big-radix-floating-point.h
+++ b/flang/lib/Decimal/big-radix-floating-point.h
@@ -68,15 +68,15 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
   static constexpr int maxDigits{3 - minLog2AnyBit / log10Radix};
 
 public:
-  explicit BigRadixFloatingPointNumber(
+  explicit RT_API_ATTRS BigRadixFloatingPointNumber(
       enum FortranRounding rounding = RoundNearest)
       : rounding_{rounding} {}
 
   // Converts a binary floating point value.
-  explicit BigRadixFloatingPointNumber(
+  explicit RT_API_ATTRS BigRadixFloatingPointNumber(
       Real, enum FortranRounding = RoundNearest);
 
-  BigRadixFloatingPointNumber &SetToZero() {
+  RT_API_ATTRS BigRadixFloatingPointNumber &SetToZero() {
     isNegative_ = false;
     digits_ = 0;
     exponent_ = 0;
@@ -84,14 +84,14 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
   }
 
   // Converts decimal floating-point to binary.
-  ConversionToBinaryResult<PREC> ConvertToBinary();
+  RT_API_ATTRS ConversionToBinaryResult<PREC> ConvertToBinary();
 
   // Parses and converts to binary.  Handles leading spaces,
   // "NaN", & optionally-signed "Inf".  Does not skip internal
   // spaces.
   // The argument is a reference to a pointer that is left
   // pointing to the first character that wasn't parsed.
-  ConversionToBinaryResult<PREC> ConvertToBinary(
+  RT_API_ATTRS ConversionToBinaryResult<PREC> ConvertToBinary(
       const char *&, const char *end = nullptr);
 
   // Formats a decimal floating-point number to a user buffer.
@@ -100,7 +100,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
   // after the last digit; the effective decimal exponent is
   // returned as part of the result structure so that it can be
   // formatted by the client.
-  ConversionToDecimalResult ConvertToDecimal(
+  RT_API_ATTRS ConversionToDecimalResult ConvertToDecimal(
       char *, std::size_t, enum DecimalConversionFlags, int digits) const;
 
   // Discard decimal digits not needed to distinguish this value
@@ -112,13 +112,14 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
   // This minimization necessarily assumes that the value will be
   // emitted and read back into the same (or less precise) format
   // with default rounding to the nearest value.
-  void Minimize(
+  RT_API_ATTRS void Minimize(
       BigRadixFloatingPointNumber &&less, BigRadixFloatingPointNumber &&more);
 
   template <typename STREAM> STREAM &Dump(STREAM &) const;
 
 private:
-  BigRadixFloatingPointNumber(const BigRadixFloatingPointNumber &that)
+  RT_API_ATTRS BigRadixFloatingPointNumber(
+      const BigRadixFloatingPointNumber &that)
       : digits_{that.digits_}, exponent_{that.exponent_},
         isNegative_{that.isNegative_}, rounding_{that.rounding_} {
     for (int j{0}; j < digits_; ++j) {
@@ -126,7 +127,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
     }
   }
 
-  bool IsZero() const {
+  RT_API_ATTRS bool IsZero() const {
     // Don't assume normalization.
     for (int j{0}; j < digits_; ++j) {
       if (digit_[j] != 0) {
@@ -140,13 +141,13 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
   // (When this happens during decimal-to-binary conversion,
   // there are more digits in the input string than can be
   // represented precisely.)
-  bool IsFull() const {
+  RT_API_ATTRS bool IsFull() const {
     return digits_ == digitLimit_ && digit_[digits_ - 1] >= radix / 10;
   }
 
   // Sets *this to an unsigned integer value.
   // Returns any remainder.
-  template <typename UINT> UINT SetTo(UINT n) {
+  template <typename UINT> RT_API_ATTRS UINT SetTo(UINT n) {
     static_assert(
         std::is_same_v<UINT, common::uint128_t> || std::is_unsigned_v<UINT>);
     SetToZero();
@@ -173,7 +174,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
     }
   }
 
-  int RemoveLeastOrderZeroDigits() {
+  RT_API_ATTRS int RemoveLeastOrderZeroDigits() {
     int remove{0};
     if (digits_ > 0 && digit_[0] == 0) {
       while (remove < digits_ && digit_[remove] == 0) {
@@ -197,25 +198,25 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
     return remove;
   }
 
-  void RemoveLeadingZeroDigits() {
+  RT_API_ATTRS void RemoveLeadingZeroDigits() {
     while (digits_ > 0 && digit_[digits_ - 1] == 0) {
       --digits_;
     }
   }
 
-  void Normalize() {
+  RT_API_ATTRS void Normalize() {
     RemoveLeadingZeroDigits();
     exponent_ += RemoveLeastOrderZeroDigits() * log10Radix;
   }
 
   // This limited divisibility test only works for even divisors of the radix,
   // which is fine since it's only ever used with 2 and 5.
-  template <int N> bool IsDivisibleBy() const {
+  template <int N> RT_API_ATTRS bool IsDivisibleBy() const {
     static_assert(N > 1 && radix % N == 0, "bad modulus");
     return digits_ == 0 || (digit_[0] % N) == 0;
   }
 
-  template <unsigned DIVISOR> int DivideBy() {
+  template <unsigned DIVISOR> RT_API_ATTRS int DivideBy() {
     Digit remainder{0};
     for (int j{digits_ - 1}; j >= 0; --j) {
       Digit q{digit_[j] / DIVISOR};
@@ -226,7 +227,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
     return remainder;
   }
 
-  void DivideByPowerOfTwo(int twoPow) { // twoPow <= log10Radix
+  RT_API_ATTRS void DivideByPowerOfTwo(int twoPow) { // twoPow <= log10Radix
     Digit remainder{0};
     auto mask{(Digit{1} << twoPow) - 1};
     auto coeff{radix >> twoPow};
@@ -238,7 +239,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
   }
 
   // Returns true on overflow
-  bool DivideByPowerOfTwoInPlace(int twoPow) {
+  RT_API_ATTRS bool DivideByPowerOfTwoInPlace(int twoPow) {
     if (digits_ > 0) {
       while (twoPow > 0) {
         int chunk{twoPow > log10Radix ? log10Radix : twoPow};
@@ -268,7 +269,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
     return false; // no overflow
   }
 
-  int AddCarry(int position = 0, int carry = 1) {
+  RT_API_ATTRS int AddCarry(int position = 0, int carry = 1) {
     for (; position < digits_; ++position) {
       Digit v{digit_[position] + carry};
       if (v < radix) {
@@ -290,13 +291,13 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
     return carry;
   }
 
-  void Decrement() {
+  RT_API_ATTRS void Decrement() {
     for (int j{0}; digit_[j]-- == 0; ++j) {
       digit_[j] = radix - 1;
     }
   }
 
-  template <int N> int MultiplyByHelper(int carry = 0) {
+  template <int N> RT_API_ATTRS int MultiplyByHelper(int carry = 0) {
     for (int j{0}; j < digits_; ++j) {
       auto v{N * digit_[j] + carry};
       carry = v / radix;
@@ -305,7 +306,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
     return carry;
   }
 
-  template <int N> int MultiplyBy(int carry = 0) {
+  template <int N> RT_API_ATTRS int MultiplyBy(int carry = 0) {
     if (int newCarry{MultiplyByHelper<N>(carry)}) {
       return AddCarry(digits_, newCarry);
     } else {
@@ -313,7 +314,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
     }
   }
 
-  template <int N> int MultiplyWithoutNormalization() {
+  template <int N> RT_API_ATTRS int MultiplyWithoutNormalization() {
     if (int carry{MultiplyByHelper<N>(0)}) {
       if (digits_ < digitLimit_) {
         digit_[digits_++] = carry;
@@ -326,9 +327,9 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
     }
   }
 
-  void LoseLeastSignificantDigit(); // with rounding
+  RT_API_ATTRS void LoseLeastSignificantDigit(); // with rounding
 
-  void PushCarry(int carry) {
+  RT_API_ATTRS void PushCarry(int carry) {
     if (digits_ == maxDigits && RemoveLeastOrderZeroDigits() == 0) {
       LoseLeastSignificantDigit();
       digit_[digits_ - 1] += carry;
@@ -340,18 +341,20 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
   // Adds another number and then divides by two.
   // Assumes same exponent and sign.
   // Returns true when the result has effectively been rounded down.
-  bool Mean(const BigRadixFloatingPointNumber &);
+  RT_API_ATTRS bool Mean(const BigRadixFloatingPointNumber &);
 
   // Parses a floating-point number; leaves the pointer reference
   // argument pointing at the next character after what was recognized.
   // The "end" argument can be left null if the caller is sure that the
   // string is properly terminated with an addressable character that
   // can't be in a valid floating-point character.
-  bool ParseNumber(const char *&, bool &inexact, const char *end);
+  RT_API_ATTRS bool ParseNumber(const char *&, bool &inexact, const char *end);
 
   using Raw = typename Real::RawType;
-  constexpr Raw SignBit() const { return Raw{isNegative_} << (Real::bits - 1); }
-  constexpr Raw Infinity() const {
+  constexpr RT_API_ATTRS Raw SignBit() const {
+    return Raw{isNegative_} << (Real::bits - 1);
+  }
+  constexpr RT_API_ATTRS Raw Infinity() const {
     Raw result{static_cast<Raw>(Real::maxExponent)};
     result <<= Real::significandBits;
     result |= SignBit();
@@ -360,7 +363,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
     }
     return result;
   }
-  constexpr Raw NaN(bool isQuiet = true) {
+  constexpr RT_API_ATTRS Raw NaN(bool isQuiet = true) {
     Raw result{Real::maxExponent};
     result <<= Real::significandBits;
     result |= SignBit();
@@ -373,7 +376,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
     }
     return result;
   }
-  constexpr Raw HUGE() const {
+  constexpr RT_API_ATTRS Raw HUGE() const {
     Raw result{static_cast<Raw>(Real::maxExponent)};
     result <<= Real::significandBits;
     result |= SignBit();
diff --git a/flang/lib/Decimal/binary-to-decimal.cpp b/flang/lib/Decimal/binary-to-decimal.cpp
index 55fc548a6979bd..b64865e95df24d 100644
--- a/flang/lib/Decimal/binary-to-decimal.cpp
+++ b/flang/lib/Decimal/binary-to-decimal.cpp
@@ -336,6 +336,8 @@ template ConversionToDecimalResult ConvertToDecimal<113>(char *, std::size_t,
     BinaryFloatingPointNumber<113>);
 
 extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
 ConversionToDecimalResult ConvertFloatToDecimal(char *buffer, std::size_t size,
     enum DecimalConversionFlags flags, int digits,
     enum FortranRounding rounding, float x) {
@@ -365,7 +367,9 @@ ConversionToDecimalResult ConvertLongDoubleToDecimal(char *buffer,
       rounding, Fortran::decimal::BinaryFloatingPointNumber<113>(x));
 }
 #endif
-}
+
+RT_EXT_API_GROUP_END
+} // extern "C"
 
 template <int PREC, int LOG10RADIX>
 template <typename STREAM>
diff --git a/flang/lib/Decimal/decimal-to-binary.cpp b/flang/lib/Decimal/decimal-to-binary.cpp
index c5cdb72e355f62..dc4aa82ac6fe49 100644
--- a/flang/lib/Decimal/decimal-to-binary.cpp
+++ b/flang/lib/Decimal/decimal-to-binary.cpp
@@ -191,12 +191,12 @@ template <int PREC> class IntermediateFloat {
   static constexpr IntType topBit{IntType{1} << (precision - 1)};
   static constexpr IntType mask{topBit + (topBit - 1)};
 
-  IntermediateFloat() {}
+  RT_API_ATTRS IntermediateFloat() {}
   IntermediateFloat(const IntermediateFloat &) = default;
 
   // Assumes that exponent_ is valid on entry, and may increment it.
   // Returns the number of guard_ bits that have been determined.
-  template <typename UINT> bool SetTo(UINT n) {
+  template <typename UINT> RT_API_ATTRS bool SetTo(UINT n) {
     static constexpr int nBits{CHAR_BIT * sizeof n};
     if constexpr (precision >= nBits) {
       value_ = n;
@@ -218,14 +218,14 @@ template <int PREC> class IntermediateFloat {
     }
   }
 
-  void ShiftIn(int bit = 0) { value_ = value_ + value_ + bit; }
-  bool IsFull() const { return value_ >= topBit; }
-  void AdjustExponent(int by) { exponent_ += by; }
-  void SetGuard(int g) {
+  RT_API_ATTRS void ShiftIn(int bit = 0) { value_ = value_ + value_ + bit; }
+  RT_API_ATTRS bool IsFull() const { return value_ >= topBit; }
+  RT_API_ATTRS void AdjustExponent(int by) { exponent_ += by; }
+  RT_API_ATTRS void SetGuard(int g) {
     guard_ |= (static_cast<GuardType>(g & 6) << (guardBits - 3)) | (g & 1);
   }
 
-  ConversionToBinaryResult<PREC> ToBinary(
+  RT_API_ATTRS ConversionToBinaryResult<PREC> ToBinary(
       bool isNegative, FortranRounding) const;
 
 private:
@@ -241,7 +241,7 @@ template <int PREC> class IntermediateFloat {
 // The standard says that these overflow cases round to "representable"
 // numbers, and some popular compilers interpret that to mean +/-HUGE()
 // rather than +/-Inf.
-static inline constexpr bool RoundOverflowToHuge(
+static inline RT_API_ATTRS constexpr bool RoundOverflowToHuge(
     enum FortranRounding rounding, bool isNegative) {
   return rounding == RoundToZero || (!isNegative && rounding == RoundDown) ||
       (isNegative && rounding == RoundUp);
@@ -531,6 +531,8 @@ template ConversionToBinaryResult<113> ConvertToBinary<113>(
     const char *&, enum FortranRounding, const char *end);
 
 extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
 enum ConversionResultFlags ConvertDecimalToFloat(
     const char **p, float *f, enum FortranRounding rounding) {
   auto result{Fortran::decimal::ConvertToBinary<24>(*p, rounding)};
@@ -552,5 +554,7 @@ enum ConversionResultFlags ConvertDecimalToLongDouble(
       reinterpret_cast<const void *>(&result.binary), sizeof *ld);
   return result.flags;
 }
-}
+
+RT_EXT_API_GROUP_END
+} // extern "C"
 } // namespace Fortran::decimal