[flang-commits] [flang] [llvm] [flang-rt] Enable more runtime functions for the GPU target (PR #183649)
Joseph Huber via flang-commits
flang-commits at lists.llvm.org
Fri Feb 27 06:00:34 PST 2026
https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/183649
>From 8a836ad0f6211a0bfad3c7d3082ecbad6813d32c Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 26 Feb 2026 18:21:15 -0600
Subject: [PATCH 1/3] [flang-rt] Enable more runtime functions for the GPU
target
Summary:
This enables primarily `stop.cpp` and `descriptor.cpp`. Requires a
little bit of wrangling to get it to compile. Unlike the CUDA build,
this build uses an in-tree libc++ configured for the GPU. This is
configured without thread support, environment, or filesystem, and it is
not POSIX at all. So, no mutexes, pthreads, or get/setenv.
I tested stop, but I don't know if it's actually legal to exit from
within an OpenMP offloading region.
---
flang-rt/include/flang-rt/runtime/lock.h | 6 +--
flang-rt/lib/runtime/CMakeLists.txt | 3 ++
flang-rt/lib/runtime/descriptor.cpp | 11 ++--
flang-rt/lib/runtime/environment.cpp | 8 ++-
flang-rt/lib/runtime/stop.cpp | 8 +++
flang-rt/lib/runtime/terminator.cpp | 3 ++
flang/include/flang/Common/api-attrs.h | 12 +++++
.../fortran/target-descriptor-ops.f90 | 50 +++++++++++++++++++
8 files changed, 91 insertions(+), 10 deletions(-)
create mode 100644 offload/test/offloading/fortran/target-descriptor-ops.f90
diff --git a/flang-rt/include/flang-rt/runtime/lock.h b/flang-rt/include/flang-rt/runtime/lock.h
index 7c88534245733..27927546ff95c 100644
--- a/flang-rt/include/flang-rt/runtime/lock.h
+++ b/flang-rt/include/flang-rt/runtime/lock.h
@@ -16,7 +16,7 @@
// Avoid <mutex> if possible to avoid introduction of C++ runtime
// library dependence.
-#ifndef _WIN32
+#if !defined(_WIN32) && !RT_GPU_TARGET
#define USE_PTHREADS 1
#else
#undef USE_PTHREADS
@@ -34,7 +34,7 @@ namespace Fortran::runtime {
class Lock {
public:
-#if RT_USE_PSEUDO_LOCK
+#if RT_USE_PSEUDO_LOCK || RT_GPU_TARGET
// No lock implementation, e.g. for using together
// with RT_USE_PSEUDO_FILE_UNIT.
// The users of Lock class may use it under
@@ -88,7 +88,7 @@ class Lock {
}
private:
-#if RT_USE_PSEUDO_FILE_UNIT
+#if RT_USE_PSEUDO_FILE_UNIT || RT_GPU_TARGET
// No state.
#elif USE_PTHREADS
pthread_mutex_t mutex_{};
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index 9fa8376e9b99c..d18ce6caccaa3 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -109,9 +109,11 @@ set(gpu_sources
copy.cpp
derived-api.cpp
derived.cpp
+ descriptor.cpp
dot-product.cpp
edit-output.cpp
extrema.cpp
+ environment.cpp
findloc.cpp
format.cpp
inquiry.cpp
@@ -127,6 +129,7 @@ set(gpu_sources
product.cpp
ragged.cpp
stat.cpp
+ stop.cpp
sum.cpp
support.cpp
terminator.cpp
diff --git a/flang-rt/lib/runtime/descriptor.cpp b/flang-rt/lib/runtime/descriptor.cpp
index 04bbb3877a0d0..6e828858cd7ec 100644
--- a/flang-rt/lib/runtime/descriptor.cpp
+++ b/flang-rt/lib/runtime/descriptor.cpp
@@ -8,10 +8,10 @@
#include "flang-rt/runtime/descriptor.h"
#include "ISO_Fortran_util.h"
-#include "memory.h"
#include "flang-rt/runtime/allocator-registry.h"
#include "flang-rt/runtime/derived.h"
#include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/memory.h"
#include "flang-rt/runtime/stat.h"
#include "flang-rt/runtime/terminator.h"
#include "flang-rt/runtime/type-info.h"
@@ -155,9 +155,10 @@ RT_API_ATTRS OwningPtr<Descriptor> Descriptor::Create(
RT_API_ATTRS std::size_t Descriptor::SizeInBytes() const {
const DescriptorAddendum *addendum{Addendum()};
- std::size_t bytes{ sizeof *this - sizeof(Dimension) + raw_.rank * sizeof(Dimension) +
- (addendum ? addendum->SizeInBytes() : 0)};
- assert (bytes <= MaxDescriptorSizeInBytes(raw_.rank,addendum) && "Descriptor must fit compiler-allocated space");
+ std::size_t bytes{sizeof *this - sizeof(Dimension) +
+ raw_.rank * sizeof(Dimension) + (addendum ? addendum->SizeInBytes() : 0)};
+ assert(bytes <= MaxDescriptorSizeInBytes(raw_.rank, addendum) &&
+ "Descriptor must fit compiler-allocated space");
return bytes;
}
@@ -465,7 +466,7 @@ void Descriptor::Dump(FILE *f, bool dumpRawType) const {
std::fprintf(f, " sm %jd\n",
static_cast<std::intmax_t>(raw_.dim[j].sm));
}
- if (const DescriptorAddendum * addendum{Addendum()}) {
+ if (const DescriptorAddendum *addendum{Addendum()}) {
addendum->Dump(f);
}
}
diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp
index ae4d6d305f409..53e13cd929bf8 100644
--- a/flang-rt/lib/runtime/environment.cpp
+++ b/flang-rt/lib/runtime/environment.cpp
@@ -8,7 +8,7 @@
#include "flang-rt/runtime/environment.h"
#include "environment-default-list.h"
-#include "memory.h"
+#include "flang-rt/runtime/memory.h"
#include "flang-rt/runtime/tools.h"
#include <cstdio>
#include <cstdlib>
@@ -19,10 +19,11 @@
#ifdef _MSC_VER
extern char **_environ;
#endif
-#elif defined(__FreeBSD__)
+#elif defined(__FreeBSD__) || RT_GPU_TARGET
// FreeBSD has environ in crt rather than libc. Using "extern char** environ"
// in the code of a shared library makes it fail to link with -Wl,--no-undefined
// See https://reviews.freebsd.org/D30842#840642
+// GPU targets do not provide environ.
#else
extern char **environ;
#endif
@@ -51,6 +52,8 @@ static void (*PostConfigEnvCallback[ExecutionEnvironment::nConfigEnvCallback])(
int, const char *[], const char *[], const EnvironmentDefaultList *){
nullptr};
+// No environment support on the GPU.
+#if !RT_GPU_TARGET
static void SetEnvironmentDefaults(const EnvironmentDefaultList *envDefaults) {
if (!envDefaults) {
return;
@@ -314,6 +317,7 @@ std::int32_t ExecutionEnvironment::UnsetEnv(
return status;
}
+#endif
extern "C" {
diff --git a/flang-rt/lib/runtime/stop.cpp b/flang-rt/lib/runtime/stop.cpp
index 75fa64c4c0039..5abb80af7e66d 100644
--- a/flang-rt/lib/runtime/stop.cpp
+++ b/flang-rt/lib/runtime/stop.cpp
@@ -24,10 +24,14 @@
extern "C" {
[[maybe_unused]] static void DescribeIEEESignaledExceptions() {
+#if defined(RT_DEVICE_COMPILATION) || RT_GPU_TARGET
+ unsigned excepts{}; // No fenv support on the device.
+#else
#ifdef fetestexcept // a macro in some environments; omit std::
auto excepts{fetestexcept(FE_ALL_EXCEPT)};
#else
auto excepts{std::fetestexcept(FE_ALL_EXCEPT)};
+#endif
#endif
if (excepts) {
std::fputs("IEEE arithmetic exceptions signaled:", stderr);
@@ -61,8 +65,10 @@ extern "C" {
}
static void CloseAllExternalUnits(const char *why) {
+#if !RT_GPU_TARGET
Fortran::runtime::io::IoErrorHandler handler{why};
Fortran::runtime::io::ExternalFileUnit::CloseAll(handler);
+#endif
}
[[noreturn]] RT_API_ATTRS void RTNAME(StopStatement)(
@@ -134,6 +140,7 @@ static void CloseAllExternalUnits(const char *why) {
#endif
}
+#if !RT_GPU_TARGET
static bool StartPause() {
if (Fortran::runtime::io::IsATerminal(0)) {
Fortran::runtime::io::IoErrorHandler handler{"PAUSE statement"};
@@ -173,6 +180,7 @@ void RTNAME(PauseStatementText)(const char *code, std::size_t length) {
EndPause();
}
}
+#endif
[[noreturn]] void RTNAME(FailImageStatement)() {
CloseAllExternalUnits("FAIL IMAGE statement");
diff --git a/flang-rt/lib/runtime/terminator.cpp b/flang-rt/lib/runtime/terminator.cpp
index e8d64223919e4..2c06c8de74d0f 100644
--- a/flang-rt/lib/runtime/terminator.cpp
+++ b/flang-rt/lib/runtime/terminator.cpp
@@ -70,8 +70,11 @@ RT_API_ATTRS void Terminator::CrashHeader() const {
std::printf("\n");
#else
fputc('\n', stderr);
+ // TODO: This should flush the buffers through the RPC interface.
+#if !RT_GPU_TARGET
// FIXME: re-enable the flush along with the IO enabling.
io::FlushOutputOnCrash(*this);
+#endif
#endif
NotifyOtherImagesOfErrorTermination(EXIT_FAILURE);
#if defined(RT_DEVICE_COMPILATION)
diff --git a/flang/include/flang/Common/api-attrs.h b/flang/include/flang/Common/api-attrs.h
index fd524ee34ccff..efb495e7f132e 100644
--- a/flang/include/flang/Common/api-attrs.h
+++ b/flang/include/flang/Common/api-attrs.h
@@ -133,6 +133,18 @@
#undef RT_DEVICE_COMPILATION
#endif
+/*
+ * RT_GPU_TARGET is defined when compiling natively for a GPU
+ * target (AMDGPU or NVPTX) using a GPU-hosted libc/libc++. This is
+ * distinct from RT_DEVICE_COMPILATION which covers CUDA and OpenMP
+ * offload paths that use separate host/device compilation.
+ */
+#if defined(__AMDGPU__) || defined(__NVPTX__)
+#define RT_GPU_TARGET 1
+#else
+#undef RT_GPU_TARGET
+#endif
+
/*
* Recurrence in the call graph prevents computing minimal stack size
* required for a kernel execution. This macro can be used to disable
diff --git a/offload/test/offloading/fortran/target-descriptor-ops.f90 b/offload/test/offloading/fortran/target-descriptor-ops.f90
new file mode 100644
index 0000000000000..43dae03b33995
--- /dev/null
+++ b/offload/test/offloading/fortran/target-descriptor-ops.f90
@@ -0,0 +1,50 @@
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+ implicit none
+ integer :: result
+
+ ! CHECK: 100
+ result = 0
+ !$omp target map(from: result)
+ block
+ integer, allocatable :: arr(:)
+ integer :: i
+ allocate(arr(4))
+ do i = 1, 4
+ arr(i) = i * 10
+ end do
+ result = arr(1) + arr(2) + arr(3) + arr(4)
+ deallocate(arr)
+ end block
+ !$omp end target
+ print *, result
+
+ ! CHECK: 21
+ result = 0
+ !$omp target map(from: result)
+ block
+ integer, allocatable :: mat(:,:)
+ allocate(mat(2, 3))
+ mat(1,1) = 1; mat(2,1) = 2
+ mat(1,2) = 3; mat(2,2) = 4
+ mat(1,3) = 5; mat(2,3) = 6
+ result = mat(1,1) + mat(2,1) + mat(1,2) + mat(2,2) + mat(1,3) + mat(2,3)
+ deallocate(mat)
+ end block
+ !$omp end target
+ print *, result
+
+ ! CHECK: 17
+ result = 0
+ !$omp target map(from: result)
+ block
+ integer, allocatable :: arr(:)
+ allocate(arr(8))
+ result = size(arr) + lbound(arr, 1) + ubound(arr, 1)
+ deallocate(arr)
+ end block
+ !$omp end target
+ print *, result
+end program main
>From 5db5e3fcad80b7c6026d2c437fdadb8996b49ca6 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 26 Feb 2026 19:42:59 -0600
Subject: [PATCH 2/3] comments
---
flang-rt/include/flang-rt/runtime/lock.h | 8 ++++----
flang-rt/include/flang-rt/runtime/tools.h | 2 +-
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/flang-rt/include/flang-rt/runtime/lock.h b/flang-rt/include/flang-rt/runtime/lock.h
index 27927546ff95c..8285d36de4216 100644
--- a/flang-rt/include/flang-rt/runtime/lock.h
+++ b/flang-rt/include/flang-rt/runtime/lock.h
@@ -16,13 +16,13 @@
// Avoid <mutex> if possible to avoid introduction of C++ runtime
// library dependence.
-#if !defined(_WIN32) && !RT_GPU_TARGET
+#if !defined(_WIN32)
#define USE_PTHREADS 1
#else
#undef USE_PTHREADS
#endif
-#if USE_PTHREADS
+#if USE_PTHREADS && !RT_GPU_TARGET
#include <pthread.h>
#elif defined(_WIN32)
#include "flang/Common/windows-include.h"
@@ -34,7 +34,7 @@ namespace Fortran::runtime {
class Lock {
public:
-#if RT_USE_PSEUDO_LOCK || RT_GPU_TARGET
+#if RT_USE_PSEUDO_LOCK
// No lock implementation, e.g. for using together
// with RT_USE_PSEUDO_FILE_UNIT.
// The users of Lock class may use it under
@@ -88,7 +88,7 @@ class Lock {
}
private:
-#if RT_USE_PSEUDO_FILE_UNIT || RT_GPU_TARGET
+#if RT_USE_PSEUDO_FILE_UNIT
// No state.
#elif USE_PTHREADS
pthread_mutex_t mutex_{};
diff --git a/flang-rt/include/flang-rt/runtime/tools.h b/flang-rt/include/flang-rt/runtime/tools.h
index 1939c4d907be4..a45c2ac98f2fa 100644
--- a/flang-rt/include/flang-rt/runtime/tools.h
+++ b/flang-rt/include/flang-rt/runtime/tools.h
@@ -35,7 +35,7 @@
#define RT_PRETTY_FUNCTION __func__
#endif
-#if defined(RT_DEVICE_COMPILATION)
+#if defined(RT_DEVICE_COMPILATION) || RT_GPU_TARGET
// Use the pseudo lock and pseudo file unit implementations
// for the device.
#define RT_USE_PSEUDO_LOCK 1
>From f70094c6235f252f2f4b9eaae5aa642846751667 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Fri, 27 Feb 2026 08:00:23 -0600
Subject: [PATCH 3/3] fix
---
flang-rt/include/flang-rt/runtime/lock.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/flang-rt/include/flang-rt/runtime/lock.h b/flang-rt/include/flang-rt/runtime/lock.h
index 8285d36de4216..4c6c58eb28b72 100644
--- a/flang-rt/include/flang-rt/runtime/lock.h
+++ b/flang-rt/include/flang-rt/runtime/lock.h
@@ -16,13 +16,13 @@
// Avoid <mutex> if possible to avoid introduction of C++ runtime
// library dependence.
-#if !defined(_WIN32)
+#if !defined(_WIN32) || RT_GPU_TARGET
#define USE_PTHREADS 1
#else
#undef USE_PTHREADS
#endif
-#if USE_PTHREADS && !RT_GPU_TARGET
+#if USE_PTHREADS
#include <pthread.h>
#elif defined(_WIN32)
#include "flang/Common/windows-include.h"
More information about the flang-commits
mailing list