[llvm] [flang][rt] Add noinline attributes for CUDA compile path for successful compilation (PR #161760)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 2 21:09:20 PDT 2025
https://github.com/modiking updated https://github.com/llvm/llvm-project/pull/161760
>From c95324337b4bb459760f46eaa4f4167b457cde0f Mon Sep 17 00:00:00 2001
From: Modi Mo <mmo at nvidia.com>
Date: Wed, 1 Oct 2025 20:04:16 -0700
Subject: [PATCH 1/3] enable full flang cuda build
---
flang-rt/lib/runtime/CMakeLists.txt | 3 ---
flang-rt/lib/runtime/extrema.cpp | 10 +++++++---
flang-rt/lib/runtime/findloc.cpp | 11 +++++++----
3 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index 6548ec955b2b8..e8f70bd544e0b 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -178,9 +178,6 @@ endif ()
if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx")
set(sources ${gpu_sources})
elseif(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
- # findloc.cpp has some issues with higher compute capability. Remove it
- # from CUDA build until we can lower its memory footprint.
- list(REMOVE_ITEM supported_sources findloc.cpp)
set(sources ${supported_sources})
else ()
set(sources ${supported_sources} ${host_sources} ${f128_sources})
diff --git a/flang-rt/lib/runtime/extrema.cpp b/flang-rt/lib/runtime/extrema.cpp
index 9846529665e8b..29f0e93e9631f 100644
--- a/flang-rt/lib/runtime/extrema.cpp
+++ b/flang-rt/lib/runtime/extrema.cpp
@@ -397,9 +397,13 @@ template <TypeCategory CAT, bool IS_MAX,
template <typename, bool, bool> class COMPARE>
struct DoPartialMaxOrMinLocHelper {
template <int KIND> struct Functor {
- RT_API_ATTRS void operator()(const char *intrinsic, Descriptor &result,
- const Descriptor &x, int kind, int dim, const Descriptor *mask,
- bool back, Terminator &terminator) const {
+#if defined(__CUDACC__)
+ __attribute__((noinline))
+#endif
+ RT_API_ATTRS void
+ operator()(const char *intrinsic, Descriptor &result, const Descriptor &x,
+ int kind, int dim, const Descriptor *mask, bool back,
+ Terminator &terminator) const {
DoPartialMaxOrMinLoc<CAT, KIND, IS_MAX, COMPARE>(
intrinsic, result, x, kind, dim, mask, back, terminator);
}
diff --git a/flang-rt/lib/runtime/findloc.cpp b/flang-rt/lib/runtime/findloc.cpp
index 5485f4b97bd2f..fe11386988476 100644
--- a/flang-rt/lib/runtime/findloc.cpp
+++ b/flang-rt/lib/runtime/findloc.cpp
@@ -153,10 +153,13 @@ template <TypeCategory CAT,
class HELPER>
struct NumericFindlocHelper {
template <int KIND> struct Functor {
- RT_API_ATTRS void operator()(TypeCategory targetCat, int targetKind,
- Descriptor &result, const Descriptor &x, const Descriptor &target,
- int kind, int dim, const Descriptor *mask, bool back,
- Terminator &terminator) const {
+#if defined(__CUDACC__)
+ __attribute__((noinline))
+#endif
+ RT_API_ATTRS void
+ operator()(TypeCategory targetCat, int targetKind, Descriptor &result,
+ const Descriptor &x, const Descriptor &target, int kind, int dim,
+ const Descriptor *mask, bool back, Terminator &terminator) const {
switch (targetCat) {
case TypeCategory::Integer:
case TypeCategory::Unsigned:
>From 65c8c54335b1f3ef061744e9cca90c96c7358ec0 Mon Sep 17 00:00:00 2001
From: Modi Mo <mmo at nvidia.com>
Date: Thu, 2 Oct 2025 18:28:38 -0700
Subject: [PATCH 2/3] add comments
---
flang-rt/lib/runtime/extrema.cpp | 3 +++
flang-rt/lib/runtime/findloc.cpp | 3 +++
2 files changed, 6 insertions(+)
diff --git a/flang-rt/lib/runtime/extrema.cpp b/flang-rt/lib/runtime/extrema.cpp
index 29f0e93e9631f..3c9af38a64ddc 100644
--- a/flang-rt/lib/runtime/extrema.cpp
+++ b/flang-rt/lib/runtime/extrema.cpp
@@ -397,6 +397,9 @@ template <TypeCategory CAT, bool IS_MAX,
template <typename, bool, bool> class COMPARE>
struct DoPartialMaxOrMinLocHelper {
template <int KIND> struct Functor {
+ // NVCC inlines more aggressively which causes too many specializations of
+ // this function to be inlined causing compiler timeouts. Set as
+ // noinline to allow compilation to complete.
#if defined(__CUDACC__)
__attribute__((noinline))
#endif
diff --git a/flang-rt/lib/runtime/findloc.cpp b/flang-rt/lib/runtime/findloc.cpp
index fe11386988476..e1060bf82c333 100644
--- a/flang-rt/lib/runtime/findloc.cpp
+++ b/flang-rt/lib/runtime/findloc.cpp
@@ -153,6 +153,9 @@ template <TypeCategory CAT,
class HELPER>
struct NumericFindlocHelper {
template <int KIND> struct Functor {
+ // NVCC inlines more aggressively which causes too many specializations of
+ // this function to be inlined causing compiler timeouts. Set as
+ // noinline to allow compilation to complete.
#if defined(__CUDACC__)
__attribute__((noinline))
#endif
>From 7c715317555227eccb48faf96ead72464e0adfc9 Mon Sep 17 00:00:00 2001
From: Modi Mo <mmo at nvidia.com>
Date: Thu, 2 Oct 2025 21:07:56 -0700
Subject: [PATCH 3/3] use RT_DEVICE_NOINLINE and clang format
---
flang-rt/lib/runtime/extrema.cpp | 10 +++-------
flang-rt/lib/runtime/findloc.cpp | 11 ++++-------
2 files changed, 7 insertions(+), 14 deletions(-)
diff --git a/flang-rt/lib/runtime/extrema.cpp b/flang-rt/lib/runtime/extrema.cpp
index 3c9af38a64ddc..c4575cced9017 100644
--- a/flang-rt/lib/runtime/extrema.cpp
+++ b/flang-rt/lib/runtime/extrema.cpp
@@ -400,13 +400,9 @@ struct DoPartialMaxOrMinLocHelper {
// NVCC inlines more aggressively which causes too many specializations of
// this function to be inlined causing compiler timeouts. Set as
// noinline to allow compilation to complete.
-#if defined(__CUDACC__)
- __attribute__((noinline))
-#endif
- RT_API_ATTRS void
- operator()(const char *intrinsic, Descriptor &result, const Descriptor &x,
- int kind, int dim, const Descriptor *mask, bool back,
- Terminator &terminator) const {
+ RT_API_ATTRS RT_DEVICE_NOINLINE void operator()(const char *intrinsic,
+ Descriptor &result, const Descriptor &x, int kind, int dim,
+ const Descriptor *mask, bool back, Terminator &terminator) const {
DoPartialMaxOrMinLoc<CAT, KIND, IS_MAX, COMPARE>(
intrinsic, result, x, kind, dim, mask, back, terminator);
}
diff --git a/flang-rt/lib/runtime/findloc.cpp b/flang-rt/lib/runtime/findloc.cpp
index e1060bf82c333..b5031ec95508d 100644
--- a/flang-rt/lib/runtime/findloc.cpp
+++ b/flang-rt/lib/runtime/findloc.cpp
@@ -156,13 +156,10 @@ struct NumericFindlocHelper {
// NVCC inlines more aggressively which causes too many specializations of
// this function to be inlined causing compiler timeouts. Set as
// noinline to allow compilation to complete.
-#if defined(__CUDACC__)
- __attribute__((noinline))
-#endif
- RT_API_ATTRS void
- operator()(TypeCategory targetCat, int targetKind, Descriptor &result,
- const Descriptor &x, const Descriptor &target, int kind, int dim,
- const Descriptor *mask, bool back, Terminator &terminator) const {
+ RT_API_ATTRS RT_DEVICE_NOINLINE void operator()(TypeCategory targetCat,
+ int targetKind, Descriptor &result, const Descriptor &x,
+ const Descriptor &target, int kind, int dim, const Descriptor *mask,
+ bool back, Terminator &terminator) const {
switch (targetCat) {
case TypeCategory::Integer:
case TypeCategory::Unsigned:
More information about the llvm-commits mailing list