[llvm] [flang][rt] Add noinline attributes for CUDA compile path for successful compilation (PR #161760)

via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 2 18:36:05 PDT 2025


https://github.com/modiking created https://github.com/llvm/llvm-project/pull/161760

NVCC inlines more aggressively than Clang/GCC, which causes the exported functions in extrema.cpp and findloc.cpp to become extremely large due to function specializations, leading to compilation timeouts. Marking the two functions in this change as noinline for NVCC alleviates the problem, as it removes the worst of the cross-matrix argument specializations.

Also remove the workaround from https://github.com/llvm/llvm-project/pull/156542 that excluded findloc.cpp from the CUDA flang-rt build.

Testing:
ninja flang-rt builds in ~30 minutes; these 2 files build in ~3 minutes.

>From c95324337b4bb459760f46eaa4f4167b457cde0f Mon Sep 17 00:00:00 2001
From: Modi Mo <mmo at nvidia.com>
Date: Wed, 1 Oct 2025 20:04:16 -0700
Subject: [PATCH 1/2] enable full flang cuda build

---
 flang-rt/lib/runtime/CMakeLists.txt |  3 ---
 flang-rt/lib/runtime/extrema.cpp    | 10 +++++++---
 flang-rt/lib/runtime/findloc.cpp    | 11 +++++++----
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index 6548ec955b2b8..e8f70bd544e0b 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -178,9 +178,6 @@ endif ()
 if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx")
   set(sources ${gpu_sources})
 elseif(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
-  # findloc.cpp has some issues with higher compute capability. Remove it
-  # from CUDA build until we can lower its memory footprint.
-  list(REMOVE_ITEM supported_sources findloc.cpp)
   set(sources ${supported_sources})
 else ()
   set(sources ${supported_sources} ${host_sources} ${f128_sources})
diff --git a/flang-rt/lib/runtime/extrema.cpp b/flang-rt/lib/runtime/extrema.cpp
index 9846529665e8b..29f0e93e9631f 100644
--- a/flang-rt/lib/runtime/extrema.cpp
+++ b/flang-rt/lib/runtime/extrema.cpp
@@ -397,9 +397,13 @@ template <TypeCategory CAT, bool IS_MAX,
     template <typename, bool, bool> class COMPARE>
 struct DoPartialMaxOrMinLocHelper {
   template <int KIND> struct Functor {
-    RT_API_ATTRS void operator()(const char *intrinsic, Descriptor &result,
-        const Descriptor &x, int kind, int dim, const Descriptor *mask,
-        bool back, Terminator &terminator) const {
+#if defined(__CUDACC__)
+    __attribute__((noinline))
+#endif
+    RT_API_ATTRS void
+    operator()(const char *intrinsic, Descriptor &result, const Descriptor &x,
+        int kind, int dim, const Descriptor *mask, bool back,
+        Terminator &terminator) const {
       DoPartialMaxOrMinLoc<CAT, KIND, IS_MAX, COMPARE>(
           intrinsic, result, x, kind, dim, mask, back, terminator);
     }
diff --git a/flang-rt/lib/runtime/findloc.cpp b/flang-rt/lib/runtime/findloc.cpp
index 5485f4b97bd2f..fe11386988476 100644
--- a/flang-rt/lib/runtime/findloc.cpp
+++ b/flang-rt/lib/runtime/findloc.cpp
@@ -153,10 +153,13 @@ template <TypeCategory CAT,
     class HELPER>
 struct NumericFindlocHelper {
   template <int KIND> struct Functor {
-    RT_API_ATTRS void operator()(TypeCategory targetCat, int targetKind,
-        Descriptor &result, const Descriptor &x, const Descriptor &target,
-        int kind, int dim, const Descriptor *mask, bool back,
-        Terminator &terminator) const {
+#if defined(__CUDACC__)
+    __attribute__((noinline))
+#endif
+    RT_API_ATTRS void
+    operator()(TypeCategory targetCat, int targetKind, Descriptor &result,
+        const Descriptor &x, const Descriptor &target, int kind, int dim,
+        const Descriptor *mask, bool back, Terminator &terminator) const {
       switch (targetCat) {
       case TypeCategory::Integer:
       case TypeCategory::Unsigned:

>From 65c8c54335b1f3ef061744e9cca90c96c7358ec0 Mon Sep 17 00:00:00 2001
From: Modi Mo <mmo at nvidia.com>
Date: Thu, 2 Oct 2025 18:28:38 -0700
Subject: [PATCH 2/2] add comments

---
 flang-rt/lib/runtime/extrema.cpp | 3 +++
 flang-rt/lib/runtime/findloc.cpp | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/flang-rt/lib/runtime/extrema.cpp b/flang-rt/lib/runtime/extrema.cpp
index 29f0e93e9631f..3c9af38a64ddc 100644
--- a/flang-rt/lib/runtime/extrema.cpp
+++ b/flang-rt/lib/runtime/extrema.cpp
@@ -397,6 +397,9 @@ template <TypeCategory CAT, bool IS_MAX,
     template <typename, bool, bool> class COMPARE>
 struct DoPartialMaxOrMinLocHelper {
   template <int KIND> struct Functor {
+    // NVCC inlines more aggressively which causes too many specializations of
+    // this function to be inlined causing compiler timeouts. Set as
+    // noinline to allow compilation to complete.
 #if defined(__CUDACC__)
     __attribute__((noinline))
 #endif
diff --git a/flang-rt/lib/runtime/findloc.cpp b/flang-rt/lib/runtime/findloc.cpp
index fe11386988476..e1060bf82c333 100644
--- a/flang-rt/lib/runtime/findloc.cpp
+++ b/flang-rt/lib/runtime/findloc.cpp
@@ -153,6 +153,9 @@ template <TypeCategory CAT,
     class HELPER>
 struct NumericFindlocHelper {
   template <int KIND> struct Functor {
+    // NVCC inlines more aggressively which causes too many specializations of
+    // this function to be inlined causing compiler timeouts. Set as
+    // noinline to allow compilation to complete.
 #if defined(__CUDACC__)
     __attribute__((noinline))
 #endif



More information about the llvm-commits mailing list