[flang-commits] [flang] [llvm] [flang-rt] Enable more runtime functions for the GPU target (PR #183649)

Joseph Huber via flang-commits flang-commits at lists.llvm.org
Thu Feb 26 17:50:23 PST 2026


https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/183649

>From 8a836ad0f6211a0bfad3c7d3082ecbad6813d32c Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 26 Feb 2026 18:21:15 -0600
Subject: [PATCH 1/2] [flang-rt] Enable more runtime functions for the GPU
 target

Summary:
This primarily enables `stop.cpp` and `descriptor.cpp`, which requires a
little bit of wrangling to get them to compile. Unlike the CUDA build,
this build uses an in-tree libc++ configured for the GPU. That libc++ is
configured without thread, environment, or filesystem support, and it is
not POSIX at all. So there are no mutexes, pthreads, or get/setenv.

I tested stop, but I don't know whether it's actually legal to exit from
OpenMP offloading.
---
 flang-rt/include/flang-rt/runtime/lock.h      |  6 +--
 flang-rt/lib/runtime/CMakeLists.txt           |  3 ++
 flang-rt/lib/runtime/descriptor.cpp           | 11 ++--
 flang-rt/lib/runtime/environment.cpp          |  8 ++-
 flang-rt/lib/runtime/stop.cpp                 |  8 +++
 flang-rt/lib/runtime/terminator.cpp           |  3 ++
 flang/include/flang/Common/api-attrs.h        | 12 +++++
 .../fortran/target-descriptor-ops.f90         | 50 +++++++++++++++++++
 8 files changed, 91 insertions(+), 10 deletions(-)
 create mode 100644 offload/test/offloading/fortran/target-descriptor-ops.f90

diff --git a/flang-rt/include/flang-rt/runtime/lock.h b/flang-rt/include/flang-rt/runtime/lock.h
index 7c88534245733..27927546ff95c 100644
--- a/flang-rt/include/flang-rt/runtime/lock.h
+++ b/flang-rt/include/flang-rt/runtime/lock.h
@@ -16,7 +16,7 @@
 
 // Avoid <mutex> if possible to avoid introduction of C++ runtime
 // library dependence.
-#ifndef _WIN32
+#if !defined(_WIN32) && !RT_GPU_TARGET
 #define USE_PTHREADS 1
 #else
 #undef USE_PTHREADS
@@ -34,7 +34,7 @@ namespace Fortran::runtime {
 
 class Lock {
 public:
-#if RT_USE_PSEUDO_LOCK
+#if RT_USE_PSEUDO_LOCK || RT_GPU_TARGET
   // No lock implementation, e.g. for using together
   // with RT_USE_PSEUDO_FILE_UNIT.
   // The users of Lock class may use it under
@@ -88,7 +88,7 @@ class Lock {
   }
 
 private:
-#if RT_USE_PSEUDO_FILE_UNIT
+#if RT_USE_PSEUDO_FILE_UNIT || RT_GPU_TARGET
   // No state.
 #elif USE_PTHREADS
   pthread_mutex_t mutex_{};
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index 9fa8376e9b99c..d18ce6caccaa3 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -109,9 +109,11 @@ set(gpu_sources
   copy.cpp
   derived-api.cpp
   derived.cpp
+  descriptor.cpp
   dot-product.cpp
   edit-output.cpp
   extrema.cpp
+  environment.cpp
   findloc.cpp
   format.cpp
   inquiry.cpp
@@ -127,6 +129,7 @@ set(gpu_sources
   product.cpp
   ragged.cpp
   stat.cpp
+  stop.cpp
   sum.cpp
   support.cpp
   terminator.cpp
diff --git a/flang-rt/lib/runtime/descriptor.cpp b/flang-rt/lib/runtime/descriptor.cpp
index 04bbb3877a0d0..6e828858cd7ec 100644
--- a/flang-rt/lib/runtime/descriptor.cpp
+++ b/flang-rt/lib/runtime/descriptor.cpp
@@ -8,10 +8,10 @@
 
 #include "flang-rt/runtime/descriptor.h"
 #include "ISO_Fortran_util.h"
-#include "memory.h"
 #include "flang-rt/runtime/allocator-registry.h"
 #include "flang-rt/runtime/derived.h"
 #include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/memory.h"
 #include "flang-rt/runtime/stat.h"
 #include "flang-rt/runtime/terminator.h"
 #include "flang-rt/runtime/type-info.h"
@@ -155,9 +155,10 @@ RT_API_ATTRS OwningPtr<Descriptor> Descriptor::Create(
 
 RT_API_ATTRS std::size_t Descriptor::SizeInBytes() const {
   const DescriptorAddendum *addendum{Addendum()};
-  std::size_t bytes{ sizeof *this - sizeof(Dimension) + raw_.rank * sizeof(Dimension) +
-      (addendum ? addendum->SizeInBytes() : 0)};
-  assert (bytes <= MaxDescriptorSizeInBytes(raw_.rank,addendum) && "Descriptor must fit compiler-allocated space");
+  std::size_t bytes{sizeof *this - sizeof(Dimension) +
+      raw_.rank * sizeof(Dimension) + (addendum ? addendum->SizeInBytes() : 0)};
+  assert(bytes <= MaxDescriptorSizeInBytes(raw_.rank, addendum) &&
+      "Descriptor must fit compiler-allocated space");
   return bytes;
 }
 
@@ -465,7 +466,7 @@ void Descriptor::Dump(FILE *f, bool dumpRawType) const {
     std::fprintf(f, "         sm          %jd\n",
         static_cast<std::intmax_t>(raw_.dim[j].sm));
   }
-  if (const DescriptorAddendum * addendum{Addendum()}) {
+  if (const DescriptorAddendum *addendum{Addendum()}) {
     addendum->Dump(f);
   }
 }
diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp
index ae4d6d305f409..53e13cd929bf8 100644
--- a/flang-rt/lib/runtime/environment.cpp
+++ b/flang-rt/lib/runtime/environment.cpp
@@ -8,7 +8,7 @@
 
 #include "flang-rt/runtime/environment.h"
 #include "environment-default-list.h"
-#include "memory.h"
+#include "flang-rt/runtime/memory.h"
 #include "flang-rt/runtime/tools.h"
 #include <cstdio>
 #include <cstdlib>
@@ -19,10 +19,11 @@
 #ifdef _MSC_VER
 extern char **_environ;
 #endif
-#elif defined(__FreeBSD__)
+#elif defined(__FreeBSD__) || RT_GPU_TARGET
 // FreeBSD has environ in crt rather than libc. Using "extern char** environ"
 // in the code of a shared library makes it fail to link with -Wl,--no-undefined
 // See https://reviews.freebsd.org/D30842#840642
+// GPU targets do not provide environ.
 #else
 extern char **environ;
 #endif
@@ -51,6 +52,8 @@ static void (*PostConfigEnvCallback[ExecutionEnvironment::nConfigEnvCallback])(
     int, const char *[], const char *[], const EnvironmentDefaultList *){
     nullptr};
 
+// No environment support on the GPU.
+#if !RT_GPU_TARGET
 static void SetEnvironmentDefaults(const EnvironmentDefaultList *envDefaults) {
   if (!envDefaults) {
     return;
@@ -314,6 +317,7 @@ std::int32_t ExecutionEnvironment::UnsetEnv(
 
   return status;
 }
+#endif
 
 extern "C" {
 
diff --git a/flang-rt/lib/runtime/stop.cpp b/flang-rt/lib/runtime/stop.cpp
index 75fa64c4c0039..5abb80af7e66d 100644
--- a/flang-rt/lib/runtime/stop.cpp
+++ b/flang-rt/lib/runtime/stop.cpp
@@ -24,10 +24,14 @@
 extern "C" {
 
 [[maybe_unused]] static void DescribeIEEESignaledExceptions() {
+#if defined(RT_DEVICE_COMPILATION) || RT_GPU_TARGET
+  unsigned excepts{}; // No fenv support on the device.
+#else
 #ifdef fetestexcept // a macro in some environments; omit std::
   auto excepts{fetestexcept(FE_ALL_EXCEPT)};
 #else
   auto excepts{std::fetestexcept(FE_ALL_EXCEPT)};
+#endif
 #endif
   if (excepts) {
     std::fputs("IEEE arithmetic exceptions signaled:", stderr);
@@ -61,8 +65,10 @@ extern "C" {
 }
 
 static void CloseAllExternalUnits(const char *why) {
+#if !RT_GPU_TARGET
   Fortran::runtime::io::IoErrorHandler handler{why};
   Fortran::runtime::io::ExternalFileUnit::CloseAll(handler);
+#endif
 }
 
 [[noreturn]] RT_API_ATTRS void RTNAME(StopStatement)(
@@ -134,6 +140,7 @@ static void CloseAllExternalUnits(const char *why) {
 #endif
 }
 
+#if !RT_GPU_TARGET
 static bool StartPause() {
   if (Fortran::runtime::io::IsATerminal(0)) {
     Fortran::runtime::io::IoErrorHandler handler{"PAUSE statement"};
@@ -173,6 +180,7 @@ void RTNAME(PauseStatementText)(const char *code, std::size_t length) {
     EndPause();
   }
 }
+#endif
 
 [[noreturn]] void RTNAME(FailImageStatement)() {
   CloseAllExternalUnits("FAIL IMAGE statement");
diff --git a/flang-rt/lib/runtime/terminator.cpp b/flang-rt/lib/runtime/terminator.cpp
index e8d64223919e4..2c06c8de74d0f 100644
--- a/flang-rt/lib/runtime/terminator.cpp
+++ b/flang-rt/lib/runtime/terminator.cpp
@@ -70,8 +70,11 @@ RT_API_ATTRS void Terminator::CrashHeader() const {
   std::printf("\n");
 #else
   fputc('\n', stderr);
+  // TODO: This should flush the buffers through the RPC interface.
+#if !RT_GPU_TARGET
   // FIXME: re-enable the flush along with the IO enabling.
   io::FlushOutputOnCrash(*this);
+#endif
 #endif
   NotifyOtherImagesOfErrorTermination(EXIT_FAILURE);
 #if defined(RT_DEVICE_COMPILATION)
diff --git a/flang/include/flang/Common/api-attrs.h b/flang/include/flang/Common/api-attrs.h
index fd524ee34ccff..efb495e7f132e 100644
--- a/flang/include/flang/Common/api-attrs.h
+++ b/flang/include/flang/Common/api-attrs.h
@@ -133,6 +133,18 @@
 #undef RT_DEVICE_COMPILATION
 #endif
 
+/*
+ * RT_GPU_TARGET is defined when compiling natively for a GPU
+ * target (AMDGPU or NVPTX) using a GPU-hosted libc/libc++. This is
+ * distinct from RT_DEVICE_COMPILATION which covers CUDA and OpenMP
+ * offload paths that use separate host/device compilation.
+ */
+#if defined(__AMDGPU__) || defined(__NVPTX__)
+#define RT_GPU_TARGET 1
+#else
+#undef RT_GPU_TARGET
+#endif
+
 /*
  * Recurrence in the call graph prevents computing minimal stack size
  * required for a kernel execution. This macro can be used to disable
diff --git a/offload/test/offloading/fortran/target-descriptor-ops.f90 b/offload/test/offloading/fortran/target-descriptor-ops.f90
new file mode 100644
index 0000000000000..43dae03b33995
--- /dev/null
+++ b/offload/test/offloading/fortran/target-descriptor-ops.f90
@@ -0,0 +1,50 @@
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+  implicit none
+  integer :: result
+
+  ! CHECK: 100
+  result = 0
+  !$omp target map(from: result)
+  block
+    integer, allocatable :: arr(:)
+    integer :: i
+    allocate(arr(4))
+    do i = 1, 4
+      arr(i) = i * 10
+    end do
+    result = arr(1) + arr(2) + arr(3) + arr(4)
+    deallocate(arr)
+  end block
+  !$omp end target
+  print *, result
+
+  ! CHECK: 21
+  result = 0
+  !$omp target map(from: result)
+  block
+    integer, allocatable :: mat(:,:)
+    allocate(mat(2, 3))
+    mat(1,1) = 1; mat(2,1) = 2
+    mat(1,2) = 3; mat(2,2) = 4
+    mat(1,3) = 5; mat(2,3) = 6
+    result = mat(1,1) + mat(2,1) + mat(1,2) + mat(2,2) + mat(1,3) + mat(2,3)
+    deallocate(mat)
+  end block
+  !$omp end target
+  print *, result
+
+  ! CHECK: 17
+  result = 0
+  !$omp target map(from: result)
+  block
+    integer, allocatable :: arr(:)
+    allocate(arr(8))
+    result = size(arr) + lbound(arr, 1) + ubound(arr, 1)
+    deallocate(arr)
+  end block
+  !$omp end target
+  print *, result
+end program main

>From 5db5e3fcad80b7c6026d2c437fdadb8996b49ca6 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 26 Feb 2026 19:42:59 -0600
Subject: [PATCH 2/2] comments

---
 flang-rt/include/flang-rt/runtime/lock.h  | 8 ++++----
 flang-rt/include/flang-rt/runtime/tools.h | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/flang-rt/include/flang-rt/runtime/lock.h b/flang-rt/include/flang-rt/runtime/lock.h
index 27927546ff95c..8285d36de4216 100644
--- a/flang-rt/include/flang-rt/runtime/lock.h
+++ b/flang-rt/include/flang-rt/runtime/lock.h
@@ -16,13 +16,13 @@
 
 // Avoid <mutex> if possible to avoid introduction of C++ runtime
 // library dependence.
-#if !defined(_WIN32) && !RT_GPU_TARGET
+#if !defined(_WIN32)
 #define USE_PTHREADS 1
 #else
 #undef USE_PTHREADS
 #endif
 
-#if USE_PTHREADS
+#if USE_PTHREADS && !RT_GPU_TARGET
 #include <pthread.h>
 #elif defined(_WIN32)
 #include "flang/Common/windows-include.h"
@@ -34,7 +34,7 @@ namespace Fortran::runtime {
 
 class Lock {
 public:
-#if RT_USE_PSEUDO_LOCK || RT_GPU_TARGET
+#if RT_USE_PSEUDO_LOCK
   // No lock implementation, e.g. for using together
   // with RT_USE_PSEUDO_FILE_UNIT.
   // The users of Lock class may use it under
@@ -88,7 +88,7 @@ class Lock {
   }
 
 private:
-#if RT_USE_PSEUDO_FILE_UNIT || RT_GPU_TARGET
+#if RT_USE_PSEUDO_FILE_UNIT
   // No state.
 #elif USE_PTHREADS
   pthread_mutex_t mutex_{};
diff --git a/flang-rt/include/flang-rt/runtime/tools.h b/flang-rt/include/flang-rt/runtime/tools.h
index 1939c4d907be4..a45c2ac98f2fa 100644
--- a/flang-rt/include/flang-rt/runtime/tools.h
+++ b/flang-rt/include/flang-rt/runtime/tools.h
@@ -35,7 +35,7 @@
 #define RT_PRETTY_FUNCTION __func__
 #endif
 
-#if defined(RT_DEVICE_COMPILATION)
+#if defined(RT_DEVICE_COMPILATION) || RT_GPU_TARGET
 // Use the pseudo lock and pseudo file unit implementations
 // for the device.
 #define RT_USE_PSEUDO_LOCK 1



More information about the flang-commits mailing list