[flang-commits] [flang] [flang][runtime] Enable more code for offload device builds. (PR #67489)

Slava Zakharin via flang-commits flang-commits at lists.llvm.org
Tue Sep 26 14:14:02 PDT 2023


https://github.com/vzakhari created https://github.com/llvm/llvm-project/pull/67489

I extended the "closure" of the device code containing the initial
transformational.cpp. The device side of the library should now be
complete, at least for some APIs. For example, I tested with C OpenMP
code calling BesselJnX0 with a nullptr descriptor that failed with
a runtime error when executing on a GPU.

I added `--expt-relaxed-constexpr` for NVCC compiler to avoid multiple
warnings about missing __attribute__((device)) on constexpr methods
coming from C++ header files.


>From cc925ab9cabca65b2e872146c435d33956a5eddc Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Tue, 26 Sep 2023 13:22:01 -0700
Subject: [PATCH] [flang][runtime] Enable more code for offload device builds.

I extended the "closure" of the device code containing the initial
transformational.cpp. The device side of the library should now be
complete, at least for some APIs. For example, I tested with C OpenMP
code calling BesselJnX0 with a nullptr descriptor that failed with
a runtime error when executing on a GPU.

I added `--expt-relaxed-constexpr` for NVCC compiler to avoid multiple
warnings about missing __attribute__((device)) on constexpr methods
coming from C++ header files.
---
 flang/include/flang/Runtime/api-attrs.h  |  23 +++++
 flang/include/flang/Runtime/descriptor.h |   8 +-
 flang/include/flang/Runtime/memory.h     | 105 +++++++++++++++++++++--
 flang/include/flang/Runtime/type-code.h  |  18 ++--
 flang/runtime/CMakeLists.txt             |   8 ++
 flang/runtime/ISO_Fortran_util.h         |  10 +--
 flang/runtime/derived.h                  |  10 ++-
 flang/runtime/descriptor.cpp             |  53 ++++++------
 flang/runtime/terminator.cpp             |  77 +++++++++++++----
 flang/runtime/terminator.h               |  71 ++++++++++++---
 flang/runtime/type-code.cpp              |   9 +-
 flang/runtime/type-info.h                |  84 +++++++++---------
 12 files changed, 350 insertions(+), 126 deletions(-)

diff --git a/flang/include/flang/Runtime/api-attrs.h b/flang/include/flang/Runtime/api-attrs.h
index a866625a7b95ba4..0768682cadbdcbb 100644
--- a/flang/include/flang/Runtime/api-attrs.h
+++ b/flang/include/flang/Runtime/api-attrs.h
@@ -42,6 +42,18 @@
 #endif
 #endif /* !defined(RT_EXT_API_GROUP_END) */
 
+/*
+ * RT_OFFLOAD_API_GROUP_BEGIN/END pair is placed around definitions
+ * of functions that can be referenced in other modules of Flang
+ * runtime. For OpenMP offload these functions are made "declare target"
+ * making sure they are compiled for the target even though direct
+ * references to them from other "declare target" functions may not
+ * be seen. Host-only functions should not be put in between these
+ * two macros.
+ */
+#define RT_OFFLOAD_API_GROUP_BEGIN RT_EXT_API_GROUP_BEGIN
+#define RT_OFFLOAD_API_GROUP_END RT_EXT_API_GROUP_END
+
 /*
  * RT_VAR_GROUP_BEGIN/END pair is placed around definitions
  * of module scope variables referenced by Flang runtime (directly
@@ -88,4 +100,15 @@
 #endif
 #endif /* !defined(RT_CONST_VAR_ATTRS) */
 
+/*
+ * RT_DEVICE_COMPILATION is defined for any device compilation.
+ * Note that it can only be used reliably with compilers that perform
+ * separate host and device compilations.
+ */
+#if ((defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)) || (defined(_OPENMP) && (defined(__AMDGCN__) || defined(__NVPTX__)))
+#define RT_DEVICE_COMPILATION 1
+#else
+#undef RT_DEVICE_COMPILATION
+#endif
+
 #endif /* !FORTRAN_RUNTIME_API_ATTRS_H_ */
diff --git a/flang/include/flang/Runtime/descriptor.h b/flang/include/flang/Runtime/descriptor.h
index 62a8d123bf2ee06..09077ec849acee0 100644
--- a/flang/include/flang/Runtime/descriptor.h
+++ b/flang/include/flang/Runtime/descriptor.h
@@ -181,19 +181,19 @@ class Descriptor {
       ISO::CFI_attribute_t attribute = CFI_attribute_other);
 
   // CUDA_TODO: Clang does not support unique_ptr on device.
-  static OwningPtr<Descriptor> Create(TypeCode t, std::size_t elementBytes,
+  static RT_API_ATTRS OwningPtr<Descriptor> Create(TypeCode t, std::size_t elementBytes,
       void *p = nullptr, int rank = maxRank,
       const SubscriptValue *extent = nullptr,
       ISO::CFI_attribute_t attribute = CFI_attribute_other,
       int derivedTypeLenParameters = 0);
-  static OwningPtr<Descriptor> Create(TypeCategory, int kind, void *p = nullptr,
+  static RT_API_ATTRS OwningPtr<Descriptor> Create(TypeCategory, int kind, void *p = nullptr,
       int rank = maxRank, const SubscriptValue *extent = nullptr,
       ISO::CFI_attribute_t attribute = CFI_attribute_other);
-  static OwningPtr<Descriptor> Create(int characterKind,
+  static RT_API_ATTRS OwningPtr<Descriptor> Create(int characterKind,
       SubscriptValue characters, void *p = nullptr, int rank = maxRank,
       const SubscriptValue *extent = nullptr,
       ISO::CFI_attribute_t attribute = CFI_attribute_other);
-  static OwningPtr<Descriptor> Create(const typeInfo::DerivedType &dt,
+  static RT_API_ATTRS OwningPtr<Descriptor> Create(const typeInfo::DerivedType &dt,
       void *p = nullptr, int rank = maxRank,
       const SubscriptValue *extent = nullptr,
       ISO::CFI_attribute_t attribute = CFI_attribute_other);
diff --git a/flang/include/flang/Runtime/memory.h b/flang/include/flang/Runtime/memory.h
index 0afe5250169d0b4..579ba78a1c93b20 100644
--- a/flang/include/flang/Runtime/memory.h
+++ b/flang/include/flang/Runtime/memory.h
@@ -12,19 +12,22 @@
 #ifndef FORTRAN_RUNTIME_MEMORY_H_
 #define FORTRAN_RUNTIME_MEMORY_H_
 
+#include "flang/Runtime/api-attrs.h"
+#include <cassert>
 #include <memory>
+#include <type_traits>
 
 namespace Fortran::runtime {
 
 class Terminator;
 
-[[nodiscard]] void *AllocateMemoryOrCrash(
+[[nodiscard]] RT_API_ATTRS void *AllocateMemoryOrCrash(
     const Terminator &, std::size_t bytes);
 template <typename A> [[nodiscard]] A &AllocateOrCrash(const Terminator &t) {
   return *reinterpret_cast<A *>(AllocateMemoryOrCrash(t, sizeof(A)));
 }
-void FreeMemory(void *);
-template <typename A> void FreeMemory(A *p) {
+RT_API_ATTRS void FreeMemory(void *);
+template <typename A> RT_API_ATTRS void FreeMemory(A *p) {
   FreeMemory(reinterpret_cast<void *>(p));
 }
 template <typename A> void FreeMemoryAndNullify(A *&p) {
@@ -32,11 +35,101 @@ template <typename A> void FreeMemoryAndNullify(A *&p) {
   p = nullptr;
 }
 
-template <typename A> struct OwningPtrDeleter {
-  void operator()(A *p) { FreeMemory(p); }
+// Very basic implementation mimicking std::unique_ptr.
+// It should work for any offload device compiler.
+// It uses a fixed memory deleter based on FreeMemory(),
+// and does not support array objects with runtime length.
+template <typename A>
+class OwningPtr {
+public:
+  using pointer_type = A *;
+
+  OwningPtr() = default;
+  RT_API_ATTRS explicit OwningPtr(pointer_type p) : ptr_(p) {}
+  RT_API_ATTRS OwningPtr(const OwningPtr &) = delete;
+  RT_API_ATTRS OwningPtr& operator=(const OwningPtr &) = delete;
+  RT_API_ATTRS OwningPtr(OwningPtr &&other) {
+    ptr_ = other.ptr_;
+    other.ptr_ = pointer_type();
+  }
+  RT_API_ATTRS OwningPtr &operator=(OwningPtr &&other) {
+    if (this != &other) {
+      delete_ptr(ptr_);
+      ptr_ = other.ptr_;
+      other.ptr_ = pointer_type();
+    }
+    return *this;
+  }
+  constexpr RT_API_ATTRS OwningPtr(std::nullptr_t) : OwningPtr() { }
+
+  // Delete the pointer, if owns one.
+  RT_API_ATTRS ~OwningPtr() {
+    if (ptr_ != pointer_type()) {
+      delete_ptr(ptr_);
+      ptr_ = pointer_type();
+    }
+  }
+
+  // Release the ownership.
+  RT_API_ATTRS pointer_type release() {
+    pointer_type p = ptr_;
+    ptr_ = pointer_type();
+    return p;
+  }
+
+  // Replace the pointer.
+  RT_API_ATTRS void reset(pointer_type p = pointer_type()) {
+    std::swap(ptr_, p);
+    if (p != pointer_type()) {
+      // Delete the owned pointer.
+      delete_ptr(p);
+    }
+  }
+
+  // Exchange the pointer with another object.
+  RT_API_ATTRS void swap(OwningPtr &other) {
+    std::swap(ptr_, other.ptr_);
+  }
+
+  // Get the stored pointer.
+  RT_API_ATTRS pointer_type get() const {
+    return ptr_;
+  }
+
+  RT_API_ATTRS explicit operator bool() const {
+    return get() == pointer_type() ? false : true;
+  }
+
+  RT_API_ATTRS typename std::add_lvalue_reference<A>::type operator*() const {
+    assert(get() != pointer_type());
+    return *get();
+  }
+
+  RT_API_ATTRS pointer_type operator->() const {
+    return get();
+  }
+
+private:
+  RT_API_ATTRS void delete_ptr(pointer_type p) {
+    FreeMemory(p);
+  }
+  pointer_type ptr_{};
 };
 
-template <typename A> using OwningPtr = std::unique_ptr<A, OwningPtrDeleter<A>>;
+template <typename X, typename Y>
+inline RT_API_ATTRS bool operator!=(const OwningPtr<X> &x, const OwningPtr<Y> &y) {
+  return x.get() != y.get();
+}
+
+template <typename X>
+inline RT_API_ATTRS bool operator!=(const OwningPtr<X> &x, std::nullptr_t) {
+  return (bool)x;
+}
+
+template <typename X>
+inline RT_API_ATTRS bool operator!=(std::nullptr_t, const OwningPtr<X> &x) {
+  return (bool)x;
+}
 
 template <typename A> class SizedNew {
 public:
diff --git a/flang/include/flang/Runtime/type-code.h b/flang/include/flang/Runtime/type-code.h
index fb18dba54980f69..172355609e26128 100644
--- a/flang/include/flang/Runtime/type-code.h
+++ b/flang/include/flang/Runtime/type-code.h
@@ -26,29 +26,29 @@ class TypeCode {
 
   RT_API_ATTRS int raw() const { return raw_; }
 
-  constexpr bool IsValid() const {
+  constexpr RT_API_ATTRS bool IsValid() const {
     return raw_ >= CFI_type_signed_char && raw_ <= CFI_TYPE_LAST;
   }
-  constexpr bool IsInteger() const {
+  constexpr RT_API_ATTRS bool IsInteger() const {
     return raw_ >= CFI_type_signed_char && raw_ <= CFI_type_ptrdiff_t;
   }
-  constexpr bool IsReal() const {
+  constexpr RT_API_ATTRS bool IsReal() const {
     return raw_ >= CFI_type_half_float && raw_ <= CFI_type_float128;
   }
-  constexpr bool IsComplex() const {
+  constexpr RT_API_ATTRS bool IsComplex() const {
     return raw_ >= CFI_type_half_float_Complex &&
         raw_ <= CFI_type_float128_Complex;
   }
-  constexpr bool IsCharacter() const {
+  constexpr RT_API_ATTRS bool IsCharacter() const {
     return raw_ == CFI_type_char || raw_ == CFI_type_char16_t ||
         raw_ == CFI_type_char32_t;
   }
-  constexpr bool IsLogical() const {
+  constexpr RT_API_ATTRS bool IsLogical() const {
     return raw_ == CFI_type_Bool ||
         (raw_ >= CFI_type_int_least8_t && raw_ <= CFI_type_int_least64_t);
   }
-  constexpr bool IsDerived() const { return raw_ == CFI_type_struct; }
-  constexpr bool IsIntrinsic() const { return IsValid() && !IsDerived(); }
+  constexpr RT_API_ATTRS bool IsDerived() const { return raw_ == CFI_type_struct; }
+  constexpr RT_API_ATTRS bool IsIntrinsic() const { return IsValid() && !IsDerived(); }
 
   RT_API_ATTRS std::optional<std::pair<TypeCategory, int>>
   GetCategoryAndKind() const;
@@ -65,7 +65,7 @@ class TypeCode {
       return thisCK && thatCK && *thisCK == *thatCK;
     }
   }
-  bool operator!=(TypeCode that) const { return !(*this == that); }
+  RT_API_ATTRS bool operator!=(TypeCode that) const { return !(*this == that); }
 
 private:
   ISO::CFI_type_t raw_{CFI_type_other};
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index 5b23065a32d1699..e7d416749219ef6 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -150,7 +150,10 @@ option(FLANG_EXPERIMENTAL_CUDA_RUNTIME
 
 # List of files that are buildable for all devices.
 set(supported_files
+  descriptor.cpp
+  terminator.cpp
   transformational.cpp
+  type-code.cpp
   )
 
 if (FLANG_EXPERIMENTAL_CUDA_RUNTIME)
@@ -175,6 +178,11 @@ if (FLANG_EXPERIMENTAL_CUDA_RUNTIME)
       -Xclang -fcuda-allow-variadic-functions
       )
   endif()
+  if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "NVIDIA")
+    set(CUDA_COMPILE_OPTIONS
+      --expt-relaxed-constexpr
+      )
+  endif()
   set_source_files_properties(${supported_files} PROPERTIES COMPILE_OPTIONS
     "${CUDA_COMPILE_OPTIONS}"
     )
diff --git a/flang/runtime/ISO_Fortran_util.h b/flang/runtime/ISO_Fortran_util.h
index 7d527bfd65789d8..d63cda8931f37b7 100644
--- a/flang/runtime/ISO_Fortran_util.h
+++ b/flang/runtime/ISO_Fortran_util.h
@@ -18,15 +18,15 @@
 #include <cstdlib>
 
 namespace Fortran::ISO {
-static inline constexpr bool IsCharacterType(CFI_type_t ty) {
+static inline constexpr RT_API_ATTRS bool IsCharacterType(CFI_type_t ty) {
   return ty == CFI_type_char || ty == CFI_type_char16_t ||
       ty == CFI_type_char32_t;
 }
-static inline constexpr bool IsAssumedSize(const CFI_cdesc_t *dv) {
+static inline constexpr RT_API_ATTRS bool IsAssumedSize(const CFI_cdesc_t *dv) {
   return dv->rank > 0 && dv->dim[dv->rank - 1].extent == -1;
 }
 
-static inline std::size_t MinElemLen(CFI_type_t type) {
+static inline RT_API_ATTRS std::size_t MinElemLen(CFI_type_t type) {
   auto typeParams{Fortran::runtime::TypeCode{type}.GetCategoryAndKind()};
   if (!typeParams) {
     Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
@@ -38,7 +38,7 @@ static inline std::size_t MinElemLen(CFI_type_t type) {
       typeParams->first, typeParams->second);
 }
 
-static inline int VerifyEstablishParameters(CFI_cdesc_t *descriptor,
+static inline RT_API_ATTRS int VerifyEstablishParameters(CFI_cdesc_t *descriptor,
     void *base_addr, CFI_attribute_t attribute, CFI_type_t type,
     std::size_t elem_len, CFI_rank_t rank, const CFI_index_t extents[],
     bool external) {
@@ -77,7 +77,7 @@ static inline int VerifyEstablishParameters(CFI_cdesc_t *descriptor,
   return CFI_SUCCESS;
 }
 
-static inline void EstablishDescriptor(CFI_cdesc_t *descriptor, void *base_addr,
+static inline RT_API_ATTRS void EstablishDescriptor(CFI_cdesc_t *descriptor, void *base_addr,
     CFI_attribute_t attribute, CFI_type_t type, std::size_t elem_len,
     CFI_rank_t rank, const CFI_index_t extents[]) {
   descriptor->base_addr = base_addr;
diff --git a/flang/runtime/derived.h b/flang/runtime/derived.h
index 747a93303e0dbc0..6b9ea907fda9b8b 100644
--- a/flang/runtime/derived.h
+++ b/flang/runtime/derived.h
@@ -11,6 +11,8 @@
 #ifndef FORTRAN_RUNTIME_DERIVED_H_
 #define FORTRAN_RUNTIME_DERIVED_H_
 
+#include "flang/Runtime/api-attrs.h"
+
 namespace Fortran::runtime::typeInfo {
 class DerivedType;
 }
@@ -21,21 +23,21 @@ class Terminator;
 
 // Perform default component initialization, allocate automatic components.
 // Returns a STAT= code (0 when all's well).
-int Initialize(const Descriptor &, const typeInfo::DerivedType &, Terminator &,
+RT_API_ATTRS int Initialize(const Descriptor &, const typeInfo::DerivedType &, Terminator &,
     bool hasStat = false, const Descriptor *errMsg = nullptr);
 
 // Call FINAL subroutines, if any
-void Finalize(
+RT_API_ATTRS void Finalize(
     const Descriptor &, const typeInfo::DerivedType &derived, Terminator *);
 
 // Call FINAL subroutines, deallocate allocatable & automatic components.
 // Does not deallocate the original descriptor.
-void Destroy(const Descriptor &, bool finalize, const typeInfo::DerivedType &,
+RT_API_ATTRS void Destroy(const Descriptor &, bool finalize, const typeInfo::DerivedType &,
     Terminator *);
 
 // Return true if the passed descriptor is for a derived type
 // entity that has a dynamic (allocatable, automatic) component.
-bool HasDynamicComponent(const Descriptor &);
+RT_API_ATTRS bool HasDynamicComponent(const Descriptor &);
 
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_DERIVED_H_
diff --git a/flang/runtime/descriptor.cpp b/flang/runtime/descriptor.cpp
index ab6460708e9b68f..043b73255ab6686 100644
--- a/flang/runtime/descriptor.cpp
+++ b/flang/runtime/descriptor.cpp
@@ -20,14 +20,16 @@
 
 namespace Fortran::runtime {
 
-Descriptor::Descriptor(const Descriptor &that) { *this = that; }
+RT_OFFLOAD_API_GROUP_BEGIN
 
-Descriptor &Descriptor::operator=(const Descriptor &that) {
+RT_API_ATTRS Descriptor::Descriptor(const Descriptor &that) { *this = that; }
+
+RT_API_ATTRS Descriptor &Descriptor::operator=(const Descriptor &that) {
   std::memcpy(this, &that, that.SizeInBytes());
   return *this;
 }
 
-void Descriptor::Establish(TypeCode t, std::size_t elementBytes, void *p,
+RT_API_ATTRS void Descriptor::Establish(TypeCode t, std::size_t elementBytes, void *p,
     int rank, const SubscriptValue *extent, ISO::CFI_attribute_t attribute,
     bool addendum) {
   Terminator terminator{__FILE__, __LINE__};
@@ -58,33 +60,33 @@ void Descriptor::Establish(TypeCode t, std::size_t elementBytes, void *p,
 
 namespace {
 template <TypeCategory CAT, int KIND> struct TypeSizeGetter {
-  constexpr std::size_t operator()() const {
+  constexpr RT_API_ATTRS std::size_t operator()() const {
     CppTypeFor<CAT, KIND> arr[2];
     return sizeof arr / 2;
   }
 };
 } // namespace
 
-std::size_t Descriptor::BytesFor(TypeCategory category, int kind) {
+RT_API_ATTRS std::size_t Descriptor::BytesFor(TypeCategory category, int kind) {
   Terminator terminator{__FILE__, __LINE__};
   return ApplyType<TypeSizeGetter, std::size_t>(category, kind, terminator);
 }
 
-void Descriptor::Establish(TypeCategory c, int kind, void *p, int rank,
+RT_API_ATTRS void Descriptor::Establish(TypeCategory c, int kind, void *p, int rank,
     const SubscriptValue *extent, ISO::CFI_attribute_t attribute,
     bool addendum) {
   Establish(TypeCode(c, kind), BytesFor(c, kind), p, rank, extent, attribute,
       addendum);
 }
 
-void Descriptor::Establish(int characterKind, std::size_t characters, void *p,
+RT_API_ATTRS void Descriptor::Establish(int characterKind, std::size_t characters, void *p,
     int rank, const SubscriptValue *extent, ISO::CFI_attribute_t attribute,
     bool addendum) {
   Establish(TypeCode{TypeCategory::Character, characterKind},
       characterKind * characters, p, rank, extent, attribute, addendum);
 }
 
-void Descriptor::Establish(const typeInfo::DerivedType &dt, void *p, int rank,
+RT_API_ATTRS void Descriptor::Establish(const typeInfo::DerivedType &dt, void *p, int rank,
     const SubscriptValue *extent, ISO::CFI_attribute_t attribute) {
   Establish(TypeCode{TypeCategory::Derived, 0}, dt.sizeInBytes(), p, rank,
       extent, attribute, true);
@@ -94,7 +96,7 @@ void Descriptor::Establish(const typeInfo::DerivedType &dt, void *p, int rank,
   new (a) DescriptorAddendum{&dt};
 }
 
-OwningPtr<Descriptor> Descriptor::Create(TypeCode t, std::size_t elementBytes,
+RT_API_ATTRS OwningPtr<Descriptor> Descriptor::Create(TypeCode t, std::size_t elementBytes,
     void *p, int rank, const SubscriptValue *extent,
     ISO::CFI_attribute_t attribute, int derivedTypeLenParameters) {
   std::size_t bytes{SizeInBytes(rank, true, derivedTypeLenParameters)};
@@ -105,33 +107,33 @@ OwningPtr<Descriptor> Descriptor::Create(TypeCode t, std::size_t elementBytes,
   return OwningPtr<Descriptor>{result};
 }
 
-OwningPtr<Descriptor> Descriptor::Create(TypeCategory c, int kind, void *p,
+RT_API_ATTRS OwningPtr<Descriptor> Descriptor::Create(TypeCategory c, int kind, void *p,
     int rank, const SubscriptValue *extent, ISO::CFI_attribute_t attribute) {
   return Create(
       TypeCode(c, kind), BytesFor(c, kind), p, rank, extent, attribute);
 }
 
-OwningPtr<Descriptor> Descriptor::Create(int characterKind,
+RT_API_ATTRS OwningPtr<Descriptor> Descriptor::Create(int characterKind,
     SubscriptValue characters, void *p, int rank, const SubscriptValue *extent,
     ISO::CFI_attribute_t attribute) {
   return Create(TypeCode{TypeCategory::Character, characterKind},
       characterKind * characters, p, rank, extent, attribute);
 }
 
-OwningPtr<Descriptor> Descriptor::Create(const typeInfo::DerivedType &dt,
+RT_API_ATTRS OwningPtr<Descriptor> Descriptor::Create(const typeInfo::DerivedType &dt,
     void *p, int rank, const SubscriptValue *extent,
     ISO::CFI_attribute_t attribute) {
   return Create(TypeCode{TypeCategory::Derived, 0}, dt.sizeInBytes(), p, rank,
       extent, attribute, dt.LenParameters());
 }
 
-std::size_t Descriptor::SizeInBytes() const {
+RT_API_ATTRS std::size_t Descriptor::SizeInBytes() const {
   const DescriptorAddendum *addendum{Addendum()};
   return sizeof *this - sizeof(Dimension) + raw_.rank * sizeof(Dimension) +
       (addendum ? addendum->SizeInBytes() : 0);
 }
 
-std::size_t Descriptor::Elements() const {
+RT_API_ATTRS std::size_t Descriptor::Elements() const {
   int n{rank()};
   std::size_t elements{1};
   for (int j{0}; j < n; ++j) {
@@ -140,7 +142,7 @@ std::size_t Descriptor::Elements() const {
   return elements;
 }
 
-int Descriptor::Allocate() {
+RT_API_ATTRS int Descriptor::Allocate() {
   std::size_t byteSize{Elements() * ElementBytes()};
   // Zero size allocation is possible in Fortran and the resulting
   // descriptor must be allocated/associated. Since std::malloc(0)
@@ -162,7 +164,7 @@ int Descriptor::Allocate() {
   return 0;
 }
 
-int Descriptor::Destroy(
+RT_API_ATTRS int Descriptor::Destroy(
     bool finalize, bool destroyPointers, Terminator *terminator) {
   if (!destroyPointers && raw_.attribute == CFI_attribute_pointer) {
     return StatOk;
@@ -178,9 +180,9 @@ int Descriptor::Destroy(
   }
 }
 
-int Descriptor::Deallocate() { return ISO::CFI_deallocate(&raw_); }
+RT_API_ATTRS int Descriptor::Deallocate() { return ISO::CFI_deallocate(&raw_); }
 
-bool Descriptor::DecrementSubscripts(
+RT_API_ATTRS bool Descriptor::DecrementSubscripts(
     SubscriptValue *subscript, const int *permutation) const {
   for (int j{raw_.rank - 1}; j >= 0; --j) {
     int k{permutation ? permutation[j] : j};
@@ -193,7 +195,7 @@ bool Descriptor::DecrementSubscripts(
   return false;
 }
 
-std::size_t Descriptor::ZeroBasedElementNumber(
+RT_API_ATTRS std::size_t Descriptor::ZeroBasedElementNumber(
     const SubscriptValue *subscript, const int *permutation) const {
   std::size_t result{0};
   std::size_t coefficient{1};
@@ -206,7 +208,7 @@ std::size_t Descriptor::ZeroBasedElementNumber(
   return result;
 }
 
-bool Descriptor::EstablishPointerSection(const Descriptor &source,
+RT_API_ATTRS bool Descriptor::EstablishPointerSection(const Descriptor &source,
     const SubscriptValue *lower, const SubscriptValue *upper,
     const SubscriptValue *stride) {
   *this = source;
@@ -232,7 +234,7 @@ bool Descriptor::EstablishPointerSection(const Descriptor &source,
   return CFI_section(&raw_, &source.raw_, lower, upper, stride) == CFI_SUCCESS;
 }
 
-void Descriptor::Check() const {
+RT_API_ATTRS void Descriptor::Check() const {
   // TODO
 }
 
@@ -258,7 +260,7 @@ void Descriptor::Dump(FILE *f) const {
   }
 }
 
-DescriptorAddendum &DescriptorAddendum::operator=(
+RT_API_ATTRS DescriptorAddendum &DescriptorAddendum::operator=(
     const DescriptorAddendum &that) {
   derivedType_ = that.derivedType_;
   auto lenParms{that.LenParameters()};
@@ -268,11 +270,11 @@ DescriptorAddendum &DescriptorAddendum::operator=(
   return *this;
 }
 
-std::size_t DescriptorAddendum::SizeInBytes() const {
+RT_API_ATTRS std::size_t DescriptorAddendum::SizeInBytes() const {
   return SizeInBytes(LenParameters());
 }
 
-std::size_t DescriptorAddendum::LenParameters() const {
+RT_API_ATTRS std::size_t DescriptorAddendum::LenParameters() const {
   const auto *type{derivedType()};
   return type ? type->LenParameters() : 0;
 }
@@ -285,4 +287,7 @@ void DescriptorAddendum::Dump(FILE *f) const {
     std::fprintf(f, "  len[%zd] %jd\n", j, static_cast<std::intmax_t>(len_[j]));
   }
 }
+
+RT_OFFLOAD_API_GROUP_END
+
 } // namespace Fortran::runtime
diff --git a/flang/runtime/terminator.cpp b/flang/runtime/terminator.cpp
index f242ac6f2de2293..bd86912cd53b0bc 100644
--- a/flang/runtime/terminator.cpp
+++ b/flang/runtime/terminator.cpp
@@ -12,14 +12,8 @@
 
 namespace Fortran::runtime {
 
-[[noreturn]] void Terminator::Crash(const char *message, ...) const {
-  va_list ap;
-  va_start(ap, message);
-  CrashArgs(message, ap);
-  va_end(ap);
-}
-
-static void (*crashHandler)(const char *, int, const char *, va_list &){
+#if !defined(RT_DEVICE_COMPILATION)
+[[maybe_unused]] static void (*crashHandler)(const char *, int, const char *, va_list &){
     nullptr};
 
 void Terminator::RegisterCrashHandler(
@@ -27,11 +21,38 @@ void Terminator::RegisterCrashHandler(
   crashHandler = handler;
 }
 
-[[noreturn]] void Terminator::CrashArgs(
-    const char *message, va_list &ap) const {
+void Terminator::InvokeCrashHandler(const char *message, ...) const {
   if (crashHandler) {
+    va_list ap;
+    va_start(ap, message);
     crashHandler(sourceFileName_, sourceLine_, message, ap);
+    va_end(ap);
+  }
+}
+
+[[noreturn]] void Terminator::CrashArgs(
+    const char *message, va_list &ap) const {
+  CrashHeader();
+  std::vfprintf(stderr, message, ap);
+  va_end(ap);
+  CrashFooter();
+}
+#endif
+
+RT_OFFLOAD_API_GROUP_BEGIN
+
+RT_API_ATTRS void Terminator::CrashHeader() const {
+#if defined(RT_DEVICE_COMPILATION)
+  std::printf("\nfatal Fortran runtime error");
+  if (sourceFileName_) {
+    std::printf("(%s", sourceFileName_);
+    if (sourceLine_) {
+      std::printf(":%d", sourceLine_);
+    }
+    std::printf(")");
   }
+  std::printf(": ");
+#else
   std::fputs("\nfatal Fortran runtime error", stderr);
   if (sourceFileName_) {
     std::fprintf(stderr, "(%s", sourceFileName_);
@@ -41,27 +62,49 @@ void Terminator::RegisterCrashHandler(
     fputc(')', stderr);
   }
   std::fputs(": ", stderr);
-  std::vfprintf(stderr, message, ap);
+#endif
+}
+
+[[noreturn]] RT_API_ATTRS void Terminator::CrashFooter() const {
+#if defined(RT_DEVICE_COMPILATION)
+  std::printf("\n");
+#else
   fputc('\n', stderr);
-  va_end(ap);
+  // FIXME: re-enable the flush along with the IO enabling.
   io::FlushOutputOnCrash(*this);
+#endif
   NotifyOtherImagesOfErrorTermination();
+#if defined(RT_DEVICE_COMPILATION)
+#if defined(__CUDACC__)
+  // NVCC supports __trap().
+  __trap();
+#elif defined(__clang__)
+  // Clang supports __builtin_trap().
+  __builtin_trap();
+#else
+#error "unsupported compiler"
+#endif
+#else
   std::abort();
+#endif
 }
 
-[[noreturn]] void Terminator::CheckFailed(
+[[noreturn]] RT_API_ATTRS void Terminator::CheckFailed(
     const char *predicate, const char *file, int line) const {
   Crash("Internal error: RUNTIME_CHECK(%s) failed at %s(%d)", predicate, file,
       line);
 }
 
-[[noreturn]] void Terminator::CheckFailed(const char *predicate) const {
+[[noreturn]] RT_API_ATTRS void Terminator::CheckFailed(const char *predicate) const {
   Crash("Internal error: RUNTIME_CHECK(%s) failed at %s(%d)", predicate,
       sourceFileName_, sourceLine_);
 }
 
 // TODO: These will be defined in the coarray runtime library
-void NotifyOtherImagesOfNormalEnd() {}
-void NotifyOtherImagesOfFailImageStatement() {}
-void NotifyOtherImagesOfErrorTermination() {}
+RT_API_ATTRS void NotifyOtherImagesOfNormalEnd() {}
+RT_API_ATTRS void NotifyOtherImagesOfFailImageStatement() {}
+RT_API_ATTRS void NotifyOtherImagesOfErrorTermination() {}
+
+RT_OFFLOAD_API_GROUP_END
+
 } // namespace Fortran::runtime
diff --git a/flang/runtime/terminator.h b/flang/runtime/terminator.h
index 84b4b1d79bf76e2..dc73407093fda23 100644
--- a/flang/runtime/terminator.h
+++ b/flang/runtime/terminator.h
@@ -13,6 +13,8 @@
 
 #include "flang/Runtime/api-attrs.h"
 #include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
 
 namespace Fortran::runtime {
 
@@ -20,26 +22,69 @@ namespace Fortran::runtime {
 // for errors detected in the runtime library
 class Terminator {
 public:
-  Terminator() {}
+  RT_API_ATTRS Terminator() {}
   Terminator(const Terminator &) = default;
   explicit RT_API_ATTRS Terminator(
       const char *sourceFileName, int sourceLine = 0)
       : sourceFileName_{sourceFileName}, sourceLine_{sourceLine} {}
 
-  const char *sourceFileName() const { return sourceFileName_; }
-  int sourceLine() const { return sourceLine_; }
+  RT_API_ATTRS const char *sourceFileName() const { return sourceFileName_; }
+  RT_API_ATTRS int sourceLine() const { return sourceLine_; }
 
-  void SetLocation(const char *sourceFileName = nullptr, int sourceLine = 0) {
+  RT_API_ATTRS void SetLocation(const char *sourceFileName = nullptr, int sourceLine = 0) {
     sourceFileName_ = sourceFileName;
     sourceLine_ = sourceLine;
   }
 
-  // CUDA_TODO: Clang for CUDA does not support varargs, though
-  // it compiles it with -fcuda-allow-variadic-functions.
-  // We can try to replace varargs functions with variadic templates.
-  [[noreturn]] RT_API_ATTRS void Crash(const char *message, ...) const;
-  [[noreturn]] RT_API_ATTRS void CrashArgs(
+  // Silence compiler warnings about the format string being
+  // non-literal. A more precise control would be
+  // __attribute__((format_arg(2))), but it requires the function
+  // to return 'char *', which does not work well with noreturn.
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wformat-security"
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wformat-security"
+#endif
+
+  // Device offload compilers do not normally support varargs and va_list,
+  // so use C++ variadic templates to forward the crash arguments
+  // to regular printf for the device compilation.
+  // Try to keep the inline implementations as small as possible.
+  template <typename... Args>
+  [[noreturn]] RT_API_ATTRS const char *Crash(const char *message, Args... args) const  {
+#if !defined(RT_DEVICE_COMPILATION)
+    // Invoke handler set up by the test harness.
+    InvokeCrashHandler(message, args...);
+#endif
+    CrashHeader();
+    PrintCrashArgs(message, args...);
+    CrashFooter();
+  }
+
+  template <typename... Args>
+  RT_API_ATTRS void PrintCrashArgs(const char *message, Args... args) const {
+#if RT_DEVICE_COMPILATION
+    std::printf(message, args...);
+#else
+    std::fprintf(stderr, message, args...);
+#endif
+  }
+
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+
+  RT_API_ATTRS void CrashHeader() const;
+  [[noreturn]] RT_API_ATTRS void CrashFooter() const;
+#if !defined(RT_DEVICE_COMPILATION)
+  void InvokeCrashHandler(const char *message, ...) const;
+  [[noreturn]] void CrashArgs(
       const char *message, va_list &) const;
+#endif
   [[noreturn]] RT_API_ATTRS void CheckFailed(
       const char *predicate, const char *file, int line) const;
   [[noreturn]] RT_API_ATTRS void CheckFailed(const char *predicate) const;
@@ -66,13 +111,13 @@ class Terminator {
   else \
     Terminator{__FILE__, __LINE__}.CheckFailed(#pred)
 
-void NotifyOtherImagesOfNormalEnd();
-void NotifyOtherImagesOfFailImageStatement();
-void NotifyOtherImagesOfErrorTermination();
+RT_API_ATTRS void NotifyOtherImagesOfNormalEnd();
+RT_API_ATTRS void NotifyOtherImagesOfFailImageStatement();
+RT_API_ATTRS void NotifyOtherImagesOfErrorTermination();
 } // namespace Fortran::runtime
 
 namespace Fortran::runtime::io {
-void FlushOutputOnCrash(const Terminator &);
+RT_API_ATTRS void FlushOutputOnCrash(const Terminator &);
 }
 
 #endif // FORTRAN_RUNTIME_TERMINATOR_H_
diff --git a/flang/runtime/type-code.cpp b/flang/runtime/type-code.cpp
index b9ce519dc14941b..b9ef307835dfba3 100644
--- a/flang/runtime/type-code.cpp
+++ b/flang/runtime/type-code.cpp
@@ -10,7 +10,9 @@
 
 namespace Fortran::runtime {
 
-TypeCode::TypeCode(TypeCategory f, int kind) {
+RT_OFFLOAD_API_GROUP_BEGIN
+
+RT_API_ATTRS TypeCode::TypeCode(TypeCategory f, int kind) {
   switch (f) {
   case TypeCategory::Integer:
     switch (kind) {
@@ -110,7 +112,7 @@ TypeCode::TypeCode(TypeCategory f, int kind) {
   }
 }
 
-std::optional<std::pair<TypeCategory, int>>
+RT_API_ATTRS std::optional<std::pair<TypeCategory, int>>
 TypeCode::GetCategoryAndKind() const {
   switch (raw_) {
   case CFI_type_signed_char:
@@ -205,4 +207,7 @@ TypeCode::GetCategoryAndKind() const {
     return std::nullopt;
   }
 }
+
+RT_OFFLOAD_API_GROUP_END
+
 } // namespace Fortran::runtime
diff --git a/flang/runtime/type-info.h b/flang/runtime/type-info.h
index 1f6c56742b6f7c2..7ed17d9ede93000 100644
--- a/flang/runtime/type-info.h
+++ b/flang/runtime/type-info.h
@@ -38,8 +38,8 @@ class Value {
     Explicit = 2,
     LenParameter = 3
   };
-  Genre genre() const { return genre_; }
-  std::optional<TypeParameterValue> GetValue(const Descriptor *) const;
+  RT_API_ATTRS Genre genre() const { return genre_; }
+  RT_API_ATTRS std::optional<TypeParameterValue> GetValue(const Descriptor *) const;
 
 private:
   Genre genre_{Genre::Explicit};
@@ -57,38 +57,38 @@ class Component {
     Automatic = 4
   };
 
-  const Descriptor &name() const { return name_.descriptor(); }
-  Genre genre() const { return genre_; }
-  TypeCategory category() const { return static_cast<TypeCategory>(category_); }
-  int kind() const { return kind_; }
-  int rank() const { return rank_; }
-  std::uint64_t offset() const { return offset_; }
-  const Value &characterLen() const { return characterLen_; }
-  const DerivedType *derivedType() const {
+  const RT_API_ATTRS Descriptor &name() const { return name_.descriptor(); }
+  RT_API_ATTRS Genre genre() const { return genre_; }
+  RT_API_ATTRS TypeCategory category() const { return static_cast<TypeCategory>(category_); }
+  RT_API_ATTRS int kind() const { return kind_; }
+  RT_API_ATTRS int rank() const { return rank_; }
+  RT_API_ATTRS std::uint64_t offset() const { return offset_; }
+  const RT_API_ATTRS Value &characterLen() const { return characterLen_; }
+  const RT_API_ATTRS DerivedType *derivedType() const {
     return derivedType_.descriptor().OffsetElement<const DerivedType>();
   }
-  const Value *lenValue() const {
+  const RT_API_ATTRS Value *lenValue() const {
     return lenValue_.descriptor().OffsetElement<const Value>();
   }
-  const Value *bounds() const {
+  const RT_API_ATTRS Value *bounds() const {
     return bounds_.descriptor().OffsetElement<const Value>();
   }
-  const char *initialization() const { return initialization_; }
+  const RT_API_ATTRS char *initialization() const { return initialization_; }
 
-  std::size_t GetElementByteSize(const Descriptor &) const;
-  std::size_t GetElements(const Descriptor &) const;
+  RT_API_ATTRS std::size_t GetElementByteSize(const Descriptor &) const;
+  RT_API_ATTRS std::size_t GetElements(const Descriptor &) const;
 
   // For components that are descriptors, returns size of descriptor;
   // for Genre::Data, returns elemental byte size times element count.
-  std::size_t SizeInBytes(const Descriptor &) const;
+  RT_API_ATTRS std::size_t SizeInBytes(const Descriptor &) const;
 
   // Establishes a descriptor from this component description.
-  void EstablishDescriptor(
+  RT_API_ATTRS void EstablishDescriptor(
       Descriptor &, const Descriptor &container, Terminator &) const;
 
   // Creates a pointer descriptor from this component description, possibly
   // with subscripts
-  void CreatePointerDescriptor(Descriptor &, const Descriptor &container,
+  RT_API_ATTRS void CreatePointerDescriptor(Descriptor &, const Descriptor &container,
       Terminator &, const SubscriptValue * = nullptr) const;
 
   FILE *Dump(FILE * = stdout) const;
@@ -135,25 +135,25 @@ class SpecialBinding {
 
   // Special bindings can be created during execution to handle defined
   // I/O procedures that are not type-bound.
-  SpecialBinding(Which which, ProcedurePointer proc, std::uint8_t isArgDescSet,
+  RT_API_ATTRS  SpecialBinding(Which which, ProcedurePointer proc, std::uint8_t isArgDescSet,
       std::uint8_t isTypeBound, std::uint8_t isArgContiguousSet)
       : which_{which}, isArgDescriptorSet_{isArgDescSet},
         isTypeBound_{isTypeBound}, isArgContiguousSet_{isArgContiguousSet},
         proc_{proc} {}
 
-  static constexpr Which RankFinal(int rank) {
+  static constexpr RT_API_ATTRS Which RankFinal(int rank) {
     return static_cast<Which>(static_cast<int>(Which::ScalarFinal) + rank);
   }
 
-  Which which() const { return which_; }
-  bool IsArgDescriptor(int zeroBasedArg) const {
+  RT_API_ATTRS Which which() const { return which_; }
+  RT_API_ATTRS bool IsArgDescriptor(int zeroBasedArg) const {
     return (isArgDescriptorSet_ >> zeroBasedArg) & 1;
   }
-  bool isTypeBound() const { return isTypeBound_; }
-  bool IsArgContiguous(int zeroBasedArg) const {
+  RT_API_ATTRS bool isTypeBound() const { return isTypeBound_; }
+  RT_API_ATTRS bool IsArgContiguous(int zeroBasedArg) const {
     return (isArgContiguousSet_ >> zeroBasedArg) & 1;
   }
-  template <typename PROC> PROC GetProc() const {
+  template <typename PROC> RT_API_ATTRS PROC GetProc() const {
     return reinterpret_cast<PROC>(proc_);
   }
 
@@ -200,36 +200,36 @@ class DerivedType {
 public:
   ~DerivedType(); // never defined
 
-  const Descriptor &binding() const { return binding_.descriptor(); }
-  const Descriptor &name() const { return name_.descriptor(); }
-  std::uint64_t sizeInBytes() const { return sizeInBytes_; }
-  const Descriptor &uninstatiated() const {
+  const RT_API_ATTRS Descriptor &binding() const { return binding_.descriptor(); }
+  const RT_API_ATTRS Descriptor &name() const { return name_.descriptor(); }
+  RT_API_ATTRS std::uint64_t sizeInBytes() const { return sizeInBytes_; }
+  const RT_API_ATTRS Descriptor &uninstatiated() const {
     return uninstantiated_.descriptor();
   }
-  const Descriptor &kindParameter() const {
+  const RT_API_ATTRS Descriptor &kindParameter() const {
     return kindParameter_.descriptor();
   }
-  const Descriptor &lenParameterKind() const {
+  const RT_API_ATTRS Descriptor &lenParameterKind() const {
     return lenParameterKind_.descriptor();
   }
-  const Descriptor &component() const { return component_.descriptor(); }
-  const Descriptor &procPtr() const { return procPtr_.descriptor(); }
-  const Descriptor &special() const { return special_.descriptor(); }
-  bool hasParent() const { return hasParent_; }
-  bool noInitializationNeeded() const { return noInitializationNeeded_; }
-  bool noDestructionNeeded() const { return noDestructionNeeded_; }
-  bool noFinalizationNeeded() const { return noFinalizationNeeded_; }
+  const RT_API_ATTRS Descriptor &component() const { return component_.descriptor(); }
+  const RT_API_ATTRS Descriptor &procPtr() const { return procPtr_.descriptor(); }
+  const RT_API_ATTRS Descriptor &special() const { return special_.descriptor(); }
+  RT_API_ATTRS bool hasParent() const { return hasParent_; }
+  RT_API_ATTRS bool noInitializationNeeded() const { return noInitializationNeeded_; }
+  RT_API_ATTRS bool noDestructionNeeded() const { return noDestructionNeeded_; }
+  RT_API_ATTRS bool noFinalizationNeeded() const { return noFinalizationNeeded_; }
 
-  std::size_t LenParameters() const { return lenParameterKind().Elements(); }
+  RT_API_ATTRS std::size_t LenParameters() const { return lenParameterKind().Elements(); }
 
-  const DerivedType *GetParentType() const;
+  const RT_API_ATTRS DerivedType *GetParentType() const;
 
   // Finds a data component by name in this derived type or its ancestors.
-  const Component *FindDataComponent(
+  const RT_API_ATTRS Component *FindDataComponent(
       const char *name, std::size_t nameLen) const;
 
   // O(1) look-up of special procedure bindings
-  const SpecialBinding *FindSpecialBinding(SpecialBinding::Which which) const {
+  const RT_API_ATTRS SpecialBinding *FindSpecialBinding(SpecialBinding::Which which) const {
     auto bitIndex{static_cast<std::uint32_t>(which)};
     auto bit{std::uint32_t{1} << bitIndex};
     if (specialBitSet_ & bit) {



More information about the flang-commits mailing list