[flang-commits] [flang] b329da8 - [flang][runtime] Support for offload build of FortranDecimal. (#87653)
via flang-commits
flang-commits at lists.llvm.org
Fri Apr 5 14:46:27 PDT 2024
Author: Slava Zakharin
Date: 2024-04-05T14:46:24-07:00
New Revision: b329da896c4959f6c56cf5e515d3412322f4b3c5
URL: https://github.com/llvm/llvm-project/commit/b329da896c4959f6c56cf5e515d3412322f4b3c5
DIFF: https://github.com/llvm/llvm-project/commit/b329da896c4959f6c56cf5e515d3412322f4b3c5.diff
LOG: [flang][runtime] Support for offload build of FortranDecimal. (#87653)
Added:
flang/cmake/modules/AddFlangOffloadRuntime.cmake
Modified:
flang/lib/Decimal/CMakeLists.txt
flang/lib/Decimal/big-radix-floating-point.h
flang/lib/Decimal/binary-to-decimal.cpp
flang/lib/Decimal/decimal-to-binary.cpp
flang/runtime/CMakeLists.txt
Removed:
################################################################################
diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake
new file mode 100644
index 00000000000000..6fb6213e90fc49
--- /dev/null
+++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake
@@ -0,0 +1,132 @@
+option(FLANG_EXPERIMENTAL_CUDA_RUNTIME
+ "Compile Fortran runtime as CUDA sources (experimental)" OFF
+ )
+
+set(FLANG_LIBCUDACXX_PATH "" CACHE PATH "Path to libcu++ package installation")
+
+set(FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD "off" CACHE STRING
+ "Compile Fortran runtime as OpenMP target offload sources (experimental). Valid options are 'off', 'host_device', 'nohost'")
+
+set(FLANG_OMP_DEVICE_ARCHITECTURES "all" CACHE STRING
+ "List of OpenMP device architectures to be used to compile the Fortran runtime (e.g. 'gfx1103;sm_90')")
+
+macro(enable_cuda_compilation files)
+ if (FLANG_EXPERIMENTAL_CUDA_RUNTIME)
+ if (BUILD_SHARED_LIBS)
+ message(FATAL_ERROR
+ "BUILD_SHARED_LIBS is not supported for CUDA build of Fortran runtime"
+ )
+ endif()
+
+ enable_language(CUDA)
+
+ # TODO: figure out how to make target property CUDA_SEPARABLE_COMPILATION
+ # work, and avoid setting CMAKE_CUDA_SEPARABLE_COMPILATION.
+ set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+
+ # Treat all supported sources as CUDA files.
+ set_source_files_properties(${files} PROPERTIES LANGUAGE CUDA)
+ set(CUDA_COMPILE_OPTIONS)
+ if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "Clang")
+ # Allow varargs.
+ set(CUDA_COMPILE_OPTIONS
+ -Xclang -fcuda-allow-variadic-functions
+ )
+ endif()
+ if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "NVIDIA")
+ set(CUDA_COMPILE_OPTIONS
+ --expt-relaxed-constexpr
+ # Disable these warnings:
+ # 'long double' is treated as 'double' in device code
+ -Xcudafe --diag_suppress=20208
+ -Xcudafe --display_error_number
+ )
+ endif()
+ set_source_files_properties(${files} PROPERTIES COMPILE_OPTIONS
+ "${CUDA_COMPILE_OPTIONS}"
+ )
+
+ if (EXISTS "${FLANG_LIBCUDACXX_PATH}/include")
+ # When using libcudacxx header files, we have to use them
+ # for all files of F18 runtime.
+ include_directories(AFTER ${FLANG_LIBCUDACXX_PATH}/include)
+ add_compile_definitions(RT_USE_LIBCUDACXX=1)
+ endif()
+ endif()
+endmacro()
+
+macro(enable_omp_offload_compilation files)
+ if (NOT FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD STREQUAL "off")
+ # 'host_device' build only works with Clang compiler currently.
+ # The build is done with the CMAKE_C/CXX_COMPILER, i.e. it does not use
+ # the in-tree built Clang. We may have a mode that would use the in-tree
+ # built Clang.
+ #
+ # 'nohost' is supposed to produce an LLVM Bitcode library,
+ # and it has to be done with a C/C++ compiler producing LLVM Bitcode
+ # compatible with the LLVM toolchain version distributed with the Flang
+ # compiler.
+ # In general, the in-tree built Clang should be used for 'nohost' build.
+ # Note that 'nohost' build does not produce the host version of Flang
+ # runtime library, so there will be two separate distributable objects.
+ # 'nohost' build is a TODO.
+
+ if (NOT FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD STREQUAL "host_device")
+ message(FATAL_ERROR "Unsupported OpenMP offload build of Flang runtime")
+ endif()
+ if (BUILD_SHARED_LIBS)
+ message(FATAL_ERROR
+ "BUILD_SHARED_LIBS is not supported for OpenMP offload build of Fortran runtime"
+ )
+ endif()
+
+ if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" AND
+ "${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
+
+ set(all_amdgpu_architectures
+ "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
+ "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030"
+ "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036"
+ "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151"
+ )
+ set(all_nvptx_architectures
+ "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
+ "sm_70;sm_72;sm_75;sm_80;sm_86;sm_89;sm_90"
+ )
+ set(all_gpu_architectures
+ "${all_amdgpu_architectures};${all_nvptx_architectures}"
+ )
+ # TODO: support auto detection on the build system.
+ if (FLANG_OMP_DEVICE_ARCHITECTURES STREQUAL "all")
+ set(FLANG_OMP_DEVICE_ARCHITECTURES ${all_gpu_architectures})
+ endif()
+ list(REMOVE_DUPLICATES FLANG_OMP_DEVICE_ARCHITECTURES)
+
+ string(REPLACE ";" "," compile_for_architectures
+ "${FLANG_OMP_DEVICE_ARCHITECTURES}"
+ )
+
+ set(OMP_COMPILE_OPTIONS
+ -fopenmp
+ -fvisibility=hidden
+ -fopenmp-cuda-mode
+ --offload-arch=${compile_for_architectures}
+ # Force LTO for the device part.
+ -foffload-lto
+ )
+ set_source_files_properties(${files} PROPERTIES COMPILE_OPTIONS
+ "${OMP_COMPILE_OPTIONS}"
+ )
+
+ # Enable "declare target" in the source code.
+ set_source_files_properties(${files}
+ PROPERTIES COMPILE_DEFINITIONS OMP_OFFLOAD_BUILD
+ )
+ else()
+ message(FATAL_ERROR
+ "Flang runtime build is not supported for these compilers:\n"
+ "CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}\n"
+ "CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}")
+ endif()
+ endif()
+endmacro()
diff --git a/flang/lib/Decimal/CMakeLists.txt b/flang/lib/Decimal/CMakeLists.txt
index 2f6caa22e1562b..3d562b8e3ce1e5 100644
--- a/flang/lib/Decimal/CMakeLists.txt
+++ b/flang/lib/Decimal/CMakeLists.txt
@@ -49,11 +49,17 @@ endif()
# avoid an unwanted dependency on libstdc++.so.
add_definitions(-U_GLIBCXX_ASSERTIONS)
-add_flang_library(FortranDecimal INSTALL_WITH_TOOLCHAIN
+set(sources
binary-to-decimal.cpp
decimal-to-binary.cpp
)
+include(AddFlangOffloadRuntime)
+enable_cuda_compilation("${sources}")
+enable_omp_offload_compilation("${sources}")
+
+add_flang_library(FortranDecimal INSTALL_WITH_TOOLCHAIN ${sources})
+
if (DEFINED MSVC)
set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
add_flang_library(FortranDecimal.static INSTALL_WITH_TOOLCHAIN
@@ -77,4 +83,4 @@ if (DEFINED MSVC)
)
add_dependencies(FortranDecimal FortranDecimal.static FortranDecimal.dynamic
FortranDecimal.static_dbg FortranDecimal.dynamic_dbg)
-endif()
\ No newline at end of file
+endif()
diff --git a/flang/lib/Decimal/big-radix-floating-point.h b/flang/lib/Decimal/big-radix-floating-point.h
index 2143d1d9b3f776..6ce8ae7925c150 100644
--- a/flang/lib/Decimal/big-radix-floating-point.h
+++ b/flang/lib/Decimal/big-radix-floating-point.h
@@ -30,6 +30,10 @@
#include <limits>
#include <type_traits>
+// Some environments, viz. glibc 2.17, allow the macro HUGE
+// to leak out of <math.h>.
+#undef HUGE
+
namespace Fortran::decimal {
static constexpr std::uint64_t TenToThe(int power) {
@@ -64,15 +68,15 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
static constexpr int maxDigits{3 - minLog2AnyBit / log10Radix};
public:
- explicit BigRadixFloatingPointNumber(
+ explicit RT_API_ATTRS BigRadixFloatingPointNumber(
enum FortranRounding rounding = RoundNearest)
: rounding_{rounding} {}
// Converts a binary floating point value.
- explicit BigRadixFloatingPointNumber(
+ explicit RT_API_ATTRS BigRadixFloatingPointNumber(
Real, enum FortranRounding = RoundNearest);
- BigRadixFloatingPointNumber &SetToZero() {
+ RT_API_ATTRS BigRadixFloatingPointNumber &SetToZero() {
isNegative_ = false;
digits_ = 0;
exponent_ = 0;
@@ -80,14 +84,14 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
}
// Converts decimal floating-point to binary.
- ConversionToBinaryResult<PREC> ConvertToBinary();
+ RT_API_ATTRS ConversionToBinaryResult<PREC> ConvertToBinary();
// Parses and converts to binary. Handles leading spaces,
// "NaN", & optionally-signed "Inf". Does not skip internal
// spaces.
// The argument is a reference to a pointer that is left
// pointing to the first character that wasn't parsed.
- ConversionToBinaryResult<PREC> ConvertToBinary(
+ RT_API_ATTRS ConversionToBinaryResult<PREC> ConvertToBinary(
const char *&, const char *end = nullptr);
// Formats a decimal floating-point number to a user buffer.
@@ -96,7 +100,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
// after the last digit; the effective decimal exponent is
// returned as part of the result structure so that it can be
// formatted by the client.
- ConversionToDecimalResult ConvertToDecimal(
+ RT_API_ATTRS ConversionToDecimalResult ConvertToDecimal(
char *, std::size_t, enum DecimalConversionFlags, int digits) const;
// Discard decimal digits not needed to distinguish this value
@@ -108,13 +112,14 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
// This minimization necessarily assumes that the value will be
// emitted and read back into the same (or less precise) format
// with default rounding to the nearest value.
- void Minimize(
+ RT_API_ATTRS void Minimize(
BigRadixFloatingPointNumber &&less, BigRadixFloatingPointNumber &&more);
template <typename STREAM> STREAM &Dump(STREAM &) const;
private:
- BigRadixFloatingPointNumber(const BigRadixFloatingPointNumber &that)
+ RT_API_ATTRS BigRadixFloatingPointNumber(
+ const BigRadixFloatingPointNumber &that)
: digits_{that.digits_}, exponent_{that.exponent_},
isNegative_{that.isNegative_}, rounding_{that.rounding_} {
for (int j{0}; j < digits_; ++j) {
@@ -122,7 +127,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
}
}
- bool IsZero() const {
+ RT_API_ATTRS bool IsZero() const {
// Don't assume normalization.
for (int j{0}; j < digits_; ++j) {
if (digit_[j] != 0) {
@@ -136,13 +141,13 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
// (When this happens during decimal-to-binary conversion,
// there are more digits in the input string than can be
// represented precisely.)
- bool IsFull() const {
+ RT_API_ATTRS bool IsFull() const {
return digits_ == digitLimit_ && digit_[digits_ - 1] >= radix / 10;
}
// Sets *this to an unsigned integer value.
// Returns any remainder.
- template <typename UINT> UINT SetTo(UINT n) {
+ template <typename UINT> RT_API_ATTRS UINT SetTo(UINT n) {
static_assert(
std::is_same_v<UINT, common::uint128_t> || std::is_unsigned_v<UINT>);
SetToZero();
@@ -169,7 +174,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
}
}
- int RemoveLeastOrderZeroDigits() {
+ RT_API_ATTRS int RemoveLeastOrderZeroDigits() {
int remove{0};
if (digits_ > 0 && digit_[0] == 0) {
while (remove < digits_ && digit_[remove] == 0) {
@@ -193,25 +198,25 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
return remove;
}
- void RemoveLeadingZeroDigits() {
+ RT_API_ATTRS void RemoveLeadingZeroDigits() {
while (digits_ > 0 && digit_[digits_ - 1] == 0) {
--digits_;
}
}
- void Normalize() {
+ RT_API_ATTRS void Normalize() {
RemoveLeadingZeroDigits();
exponent_ += RemoveLeastOrderZeroDigits() * log10Radix;
}
// This limited divisibility test only works for even divisors of the radix,
// which is fine since it's only ever used with 2 and 5.
- template <int N> bool IsDivisibleBy() const {
+ template <int N> RT_API_ATTRS bool IsDivisibleBy() const {
static_assert(N > 1 && radix % N == 0, "bad modulus");
return digits_ == 0 || (digit_[0] % N) == 0;
}
- template <unsigned DIVISOR> int DivideBy() {
+ template <unsigned DIVISOR> RT_API_ATTRS int DivideBy() {
Digit remainder{0};
for (int j{digits_ - 1}; j >= 0; --j) {
Digit q{digit_[j] / DIVISOR};
@@ -222,7 +227,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
return remainder;
}
- void DivideByPowerOfTwo(int twoPow) { // twoPow <= log10Radix
+ RT_API_ATTRS void DivideByPowerOfTwo(int twoPow) { // twoPow <= log10Radix
Digit remainder{0};
auto mask{(Digit{1} << twoPow) - 1};
auto coeff{radix >> twoPow};
@@ -234,7 +239,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
}
// Returns true on overflow
- bool DivideByPowerOfTwoInPlace(int twoPow) {
+ RT_API_ATTRS bool DivideByPowerOfTwoInPlace(int twoPow) {
if (digits_ > 0) {
while (twoPow > 0) {
int chunk{twoPow > log10Radix ? log10Radix : twoPow};
@@ -264,7 +269,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
return false; // no overflow
}
- int AddCarry(int position = 0, int carry = 1) {
+ RT_API_ATTRS int AddCarry(int position = 0, int carry = 1) {
for (; position < digits_; ++position) {
Digit v{digit_[position] + carry};
if (v < radix) {
@@ -286,13 +291,13 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
return carry;
}
- void Decrement() {
+ RT_API_ATTRS void Decrement() {
for (int j{0}; digit_[j]-- == 0; ++j) {
digit_[j] = radix - 1;
}
}
- template <int N> int MultiplyByHelper(int carry = 0) {
+ template <int N> RT_API_ATTRS int MultiplyByHelper(int carry = 0) {
for (int j{0}; j < digits_; ++j) {
auto v{N * digit_[j] + carry};
carry = v / radix;
@@ -301,7 +306,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
return carry;
}
- template <int N> int MultiplyBy(int carry = 0) {
+ template <int N> RT_API_ATTRS int MultiplyBy(int carry = 0) {
if (int newCarry{MultiplyByHelper<N>(carry)}) {
return AddCarry(digits_, newCarry);
} else {
@@ -309,7 +314,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
}
}
- template <int N> int MultiplyWithoutNormalization() {
+ template <int N> RT_API_ATTRS int MultiplyWithoutNormalization() {
if (int carry{MultiplyByHelper<N>(0)}) {
if (digits_ < digitLimit_) {
digit_[digits_++] = carry;
@@ -322,9 +327,9 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
}
}
- void LoseLeastSignificantDigit(); // with rounding
+ RT_API_ATTRS void LoseLeastSignificantDigit(); // with rounding
- void PushCarry(int carry) {
+ RT_API_ATTRS void PushCarry(int carry) {
if (digits_ == maxDigits && RemoveLeastOrderZeroDigits() == 0) {
LoseLeastSignificantDigit();
digit_[digits_ - 1] += carry;
@@ -336,18 +341,20 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
// Adds another number and then divides by two.
// Assumes same exponent and sign.
// Returns true when the result has effectively been rounded down.
- bool Mean(const BigRadixFloatingPointNumber &);
+ RT_API_ATTRS bool Mean(const BigRadixFloatingPointNumber &);
// Parses a floating-point number; leaves the pointer reference
// argument pointing at the next character after what was recognized.
// The "end" argument can be left null if the caller is sure that the
// string is properly terminated with an addressable character that
// can't be in a valid floating-point character.
- bool ParseNumber(const char *&, bool &inexact, const char *end);
+ RT_API_ATTRS bool ParseNumber(const char *&, bool &inexact, const char *end);
using Raw = typename Real::RawType;
- constexpr Raw SignBit() const { return Raw{isNegative_} << (Real::bits - 1); }
- constexpr Raw Infinity() const {
+ constexpr RT_API_ATTRS Raw SignBit() const {
+ return Raw{isNegative_} << (Real::bits - 1);
+ }
+ constexpr RT_API_ATTRS Raw Infinity() const {
Raw result{static_cast<Raw>(Real::maxExponent)};
result <<= Real::significandBits;
result |= SignBit();
@@ -356,7 +363,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
}
return result;
}
- constexpr Raw NaN(bool isQuiet = true) {
+ constexpr RT_API_ATTRS Raw NaN(bool isQuiet = true) {
Raw result{Real::maxExponent};
result <<= Real::significandBits;
result |= SignBit();
@@ -369,7 +376,7 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber {
}
return result;
}
- constexpr Raw HUGE() const {
+ constexpr RT_API_ATTRS Raw HUGE() const {
Raw result{static_cast<Raw>(Real::maxExponent)};
result <<= Real::significandBits;
result |= SignBit();
diff --git a/flang/lib/Decimal/binary-to-decimal.cpp b/flang/lib/Decimal/binary-to-decimal.cpp
index 55fc548a6979bd..b64865e95df24d 100644
--- a/flang/lib/Decimal/binary-to-decimal.cpp
+++ b/flang/lib/Decimal/binary-to-decimal.cpp
@@ -336,6 +336,8 @@ template ConversionToDecimalResult ConvertToDecimal<113>(char *, std::size_t,
BinaryFloatingPointNumber<113>);
extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
ConversionToDecimalResult ConvertFloatToDecimal(char *buffer, std::size_t size,
enum DecimalConversionFlags flags, int digits,
enum FortranRounding rounding, float x) {
@@ -365,7 +367,9 @@ ConversionToDecimalResult ConvertLongDoubleToDecimal(char *buffer,
rounding, Fortran::decimal::BinaryFloatingPointNumber<113>(x));
}
#endif
-}
+
+RT_EXT_API_GROUP_END
+} // extern "C"
template <int PREC, int LOG10RADIX>
template <typename STREAM>
diff --git a/flang/lib/Decimal/decimal-to-binary.cpp b/flang/lib/Decimal/decimal-to-binary.cpp
index c5cdb72e355f62..dc4aa82ac6fe49 100644
--- a/flang/lib/Decimal/decimal-to-binary.cpp
+++ b/flang/lib/Decimal/decimal-to-binary.cpp
@@ -191,12 +191,12 @@ template <int PREC> class IntermediateFloat {
static constexpr IntType topBit{IntType{1} << (precision - 1)};
static constexpr IntType mask{topBit + (topBit - 1)};
- IntermediateFloat() {}
+ RT_API_ATTRS IntermediateFloat() {}
IntermediateFloat(const IntermediateFloat &) = default;
// Assumes that exponent_ is valid on entry, and may increment it.
// Returns the number of guard_ bits that have been determined.
- template <typename UINT> bool SetTo(UINT n) {
+ template <typename UINT> RT_API_ATTRS bool SetTo(UINT n) {
static constexpr int nBits{CHAR_BIT * sizeof n};
if constexpr (precision >= nBits) {
value_ = n;
@@ -218,14 +218,14 @@ template <int PREC> class IntermediateFloat {
}
}
- void ShiftIn(int bit = 0) { value_ = value_ + value_ + bit; }
- bool IsFull() const { return value_ >= topBit; }
- void AdjustExponent(int by) { exponent_ += by; }
- void SetGuard(int g) {
+ RT_API_ATTRS void ShiftIn(int bit = 0) { value_ = value_ + value_ + bit; }
+ RT_API_ATTRS bool IsFull() const { return value_ >= topBit; }
+ RT_API_ATTRS void AdjustExponent(int by) { exponent_ += by; }
+ RT_API_ATTRS void SetGuard(int g) {
guard_ |= (static_cast<GuardType>(g & 6) << (guardBits - 3)) | (g & 1);
}
- ConversionToBinaryResult<PREC> ToBinary(
+ RT_API_ATTRS ConversionToBinaryResult<PREC> ToBinary(
bool isNegative, FortranRounding) const;
private:
@@ -241,7 +241,7 @@ template <int PREC> class IntermediateFloat {
// The standard says that these overflow cases round to "representable"
// numbers, and some popular compilers interpret that to mean +/-HUGE()
// rather than +/-Inf.
-static inline constexpr bool RoundOverflowToHuge(
+static inline RT_API_ATTRS constexpr bool RoundOverflowToHuge(
enum FortranRounding rounding, bool isNegative) {
return rounding == RoundToZero || (!isNegative && rounding == RoundDown) ||
(isNegative && rounding == RoundUp);
@@ -531,6 +531,8 @@ template ConversionToBinaryResult<113> ConvertToBinary<113>(
const char *&, enum FortranRounding, const char *end);
extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
enum ConversionResultFlags ConvertDecimalToFloat(
const char **p, float *f, enum FortranRounding rounding) {
auto result{Fortran::decimal::ConvertToBinary<24>(*p, rounding)};
@@ -552,5 +554,7 @@ enum ConversionResultFlags ConvertDecimalToLongDouble(
reinterpret_cast<const void *>(&result.binary), sizeof *ld);
return result.flags;
}
-}
+
+RT_EXT_API_GROUP_END
+} // extern "C"
} // namespace Fortran::decimal
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index c0e4cff698e3cb..2a65a22ab674c4 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -171,10 +171,7 @@ set(sources
utf.cpp
)
-option(FLANG_EXPERIMENTAL_CUDA_RUNTIME
- "Compile Fortran runtime as CUDA sources (experimental)" OFF
- )
-set(FLANG_LIBCUDACXX_PATH "" CACHE PATH "Path to libcu++ package installation")
+include(AddFlangOffloadRuntime)
# List of files that are buildable for all devices.
set(supported_files
@@ -227,128 +224,8 @@ set(supported_files
utf.cpp
)
-if (FLANG_EXPERIMENTAL_CUDA_RUNTIME)
- if (BUILD_SHARED_LIBS)
- message(FATAL_ERROR
- "BUILD_SHARED_LIBS is not supported for CUDA build of Fortran runtime"
- )
- endif()
-
- enable_language(CUDA)
-
- # TODO: figure out how to make target property CUDA_SEPARABLE_COMPILATION
- # work, and avoid setting CMAKE_CUDA_SEPARABLE_COMPILATION.
- set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
-
- # Treat all supported sources as CUDA files.
- set_source_files_properties(${supported_files} PROPERTIES LANGUAGE CUDA)
- set(CUDA_COMPILE_OPTIONS)
- if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "Clang")
- # Allow varargs.
- set(CUDA_COMPILE_OPTIONS
- -Xclang -fcuda-allow-variadic-functions
- )
- endif()
- if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "NVIDIA")
- set(CUDA_COMPILE_OPTIONS
- --expt-relaxed-constexpr
- # Disable these warnings:
- # 'long double' is treated as 'double' in device code
- -Xcudafe --diag_suppress=20208
- -Xcudafe --display_error_number
- )
- endif()
- set_source_files_properties(${supported_files} PROPERTIES COMPILE_OPTIONS
- "${CUDA_COMPILE_OPTIONS}"
- )
-
- if (EXISTS "${FLANG_LIBCUDACXX_PATH}/include")
- # When using libcudacxx headers files, we have to use them
- # for all files of F18 runtime.
- include_directories(AFTER ${FLANG_LIBCUDACXX_PATH}/include)
- add_compile_definitions(RT_USE_LIBCUDACXX=1)
- endif()
-endif()
-
-set(FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD "off" CACHE STRING
- "Compile Fortran runtime as OpenMP target offload sources (experimental). Valid options are 'off', 'host_device', 'nohost'")
-
-set(FLANG_OMP_DEVICE_ARCHITECTURES "all" CACHE STRING
- "List of OpenMP device architectures to be used to compile the Fortran runtime (e.g. 'gfx1103;sm_90')")
-
-if (NOT FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD STREQUAL "off")
- # 'host_device' build only works with Clang compiler currently.
- # The build is done with the CMAKE_C/CXX_COMPILER, i.e. it does not use
- # the in-tree built Clang. We may have a mode that would use the in-tree
- # built Clang.
- #
- # 'nohost' is supposed to produce an LLVM Bitcode library,
- # and it has to be done with a C/C++ compiler producing LLVM Bitcode
- # compatible with the LLVM toolchain version distributed with the Flang
- # compiler.
- # In general, the in-tree built Clang should be used for 'nohost' build.
- # Note that 'nohost' build does not produce the host version of Flang
- # runtime library, so there will be two separate distributable objects.
- # 'nohost' build is a TODO.
-
- if (NOT FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD STREQUAL "host_device")
- message(FATAL_ERROR "Unsupported OpenMP offload build of Flang runtime")
- endif()
- if (BUILD_SHARED_LIBS)
- message(FATAL_ERROR
- "BUILD_SHARED_LIBS is not supported for OpenMP offload build of Fortran runtime"
- )
- endif()
-
- if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" AND
- "${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
-
- set(all_amdgpu_architectures
- "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
- "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030"
- "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036"
- "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151"
- )
- set(all_nvptx_architectures
- "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
- "sm_70;sm_72;sm_75;sm_80;sm_86;sm_89;sm_90"
- )
- set(all_gpu_architectures
- "${all_amdgpu_architectures};${all_nvptx_architectures}"
- )
- # TODO: support auto detection on the build system.
- if (FLANG_OMP_DEVICE_ARCHITECTURES STREQUAL "all")
- set(FLANG_OMP_DEVICE_ARCHITECTURES ${all_gpu_architectures})
- endif()
- list(REMOVE_DUPLICATES FLANG_OMP_DEVICE_ARCHITECTURES)
-
- string(REPLACE ";" "," compile_for_architectures
- "${FLANG_OMP_DEVICE_ARCHITECTURES}"
- )
-
- set(OMP_COMPILE_OPTIONS
- -fopenmp
- -fvisibility=hidden
- -fopenmp-cuda-mode
- --offload-arch=${compile_for_architectures}
- # Force LTO for the device part.
- -foffload-lto
- )
- set_source_files_properties(${supported_files} PROPERTIES COMPILE_OPTIONS
- "${OMP_COMPILE_OPTIONS}"
- )
-
- # Enable "declare target" in the source code.
- set_source_files_properties(${supported_files}
- PROPERTIES COMPILE_DEFINITIONS OMP_OFFLOAD_BUILD
- )
- else()
- message(FATAL_ERROR
- "Flang runtime build is not supported for these compilers:\n"
- "CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}\n"
- "CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}")
- endif()
-endif()
+enable_cuda_compilation("${supported_files}")
+enable_omp_offload_compilation("${supported_files}")
if (NOT TARGET FortranFloat128Math)
# If FortranFloat128Math is not defined, then we are not building
More information about the flang-commits
mailing list