[flang-commits] [flang] [flang][cuda] Allow to set the stack limit size (PR #124859)

Tue Jan 28 16:09:50 PST 2025

https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/124859

>From 3641398393de637b3d172bdae52ab127368745e2 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 28 Jan 2025 15:52:08 -0800
Subject: [PATCH 1/2] [flang][cuda] Allow to set the stack limit size

---
 flang/CMakeLists.txt                          | 13 +++++-----
 .../flang/Optimizer/Builder/Runtime/Main.h    |  3 ++-
 flang/include/flang/Runtime/CUDA/init.h       | 20 +++++++++++++++
 flang/lib/Lower/Bridge.cpp                    |  4 ++-
 flang/lib/Optimizer/Builder/Runtime/Main.cpp  | 15 ++++++++++-
 flang/runtime/CUDA/CMakeLists.txt             |  1 +
 flang/runtime/CUDA/init.cpp                   | 25 +++++++++++++++++++
 flang/runtime/environment.cpp                 | 11 ++++++++
 flang/runtime/environment.h                   |  3 +++
 9 files changed, 86 insertions(+), 9 deletions(-)
 create mode 100644 flang/include/flang/Runtime/CUDA/init.h
 create mode 100644 flang/runtime/CUDA/init.cpp

diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index b619553ef83021..fb7ab4759ad37e 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -471,6 +471,13 @@ if (FLANG_INCLUDE_TESTS)
   add_compile_definitions(FLANG_INCLUDE_TESTS=1)
 endif()
 
+option(FLANG_CUF_RUNTIME
+  "Compile CUDA Fortran runtime sources" OFF)
+if (FLANG_CUF_RUNTIME)
+  find_package(CUDAToolkit REQUIRED)
+  add_compile_definitions(FLANG_CUDA_SUPPORT=1)
+endif()
+
 add_subdirectory(include)
 add_subdirectory(lib)
 add_subdirectory(cmake/modules)
@@ -481,12 +488,6 @@ if (FLANG_BUILD_TOOLS)
   add_subdirectory(tools)
 endif()
 
-option(FLANG_CUF_RUNTIME
-  "Compile CUDA Fortran runtime sources" OFF)
-if (FLANG_CUF_RUNTIME)
-  find_package(CUDAToolkit REQUIRED)
-endif()
-
 add_subdirectory(runtime)
 
 if (LLVM_INCLUDE_EXAMPLES)
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Main.h b/flang/include/flang/Optimizer/Builder/Runtime/Main.h
index e4c5dc914c700b..a0586deade42aa 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Main.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Main.h
@@ -24,7 +24,8 @@ class GlobalOp;
 namespace fir::runtime {
 
 void genMain(fir::FirOpBuilder &builder, mlir::Location loc,
-             const std::vector<Fortran::lower::EnvironmentDefault> &defs);
+             const std::vector<Fortran::lower::EnvironmentDefault> &defs,
+             bool initCuda = false);
 }
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_MAIN_H
diff --git a/flang/include/flang/Runtime/CUDA/init.h b/flang/include/flang/Runtime/CUDA/init.h
new file mode 100644
index 00000000000000..24bc6838227208
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/init.h
@@ -0,0 +1,20 @@
+//===-- include/flang/Runtime/CUDA/init.h -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_INIT_H_
+#define FORTRAN_RUNTIME_CUDA_INIT_H_
+
+#include "common.h"
+#include "flang/Runtime/entry-names.h"
+
+extern "C" {
+
+void RTDECL(CUFInit)();
+}
+
+#endif // FORTRAN_RUNTIME_CUDA_INIT_H_
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index d92dc0cf9abd62..ff80826216e4f5 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -459,7 +459,9 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     if (hasMainProgram)
       createGlobalOutsideOfFunctionLowering([&]() {
         fir::runtime::genMain(*builder, toLocation(),
-                              bridge.getEnvironmentDefaults());
+                              bridge.getEnvironmentDefaults(),
+                              getFoldingContext().languageFeatures().IsEnabled(
+                                  Fortran::common::LanguageFeature::CUDA));
       });
 
     finalizeOpenACCLowering();
diff --git a/flang/lib/Optimizer/Builder/Runtime/Main.cpp b/flang/lib/Optimizer/Builder/Runtime/Main.cpp
index ab3c4ca81314ce..5156fd54020777 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Main.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Main.cpp
@@ -16,13 +16,17 @@
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Runtime/main.h"
 #include "flang/Runtime/stop.h"
+#ifdef FLANG_CUDA_SUPPORT
+#include "flang/Runtime/CUDA/init.h"
+#endif
 
 using namespace Fortran::runtime;
 
 /// Create a `int main(...)` that calls the Fortran entry point
 void fir::runtime::genMain(
     fir::FirOpBuilder &builder, mlir::Location loc,
-    const std::vector<Fortran::lower::EnvironmentDefault> &defs) {
+    const std::vector<Fortran::lower::EnvironmentDefault> &defs,
+    bool initCuda) {
   auto *context = builder.getContext();
   auto argcTy = builder.getDefaultIntegerType();
   auto ptrTy = mlir::LLVM::LLVMPointerType::get(context);
@@ -61,6 +65,15 @@ void fir::runtime::genMain(
   args.push_back(env);
 
   builder.create<fir::CallOp>(loc, startFn, args);
+
+#ifdef FLANG_CUDA_SUPPORT
+  if (initCuda) {
+    auto initFn = builder.createFunction(
+        loc, RTNAME_STRING(CUFInit), mlir::FunctionType::get(context, {}, {}));
+    builder.create<fir::CallOp>(loc, initFn);
+  }
+#endif
+
   builder.create<fir::CallOp>(loc, qqMainFn);
   builder.create<fir::CallOp>(loc, stopFn);
 
diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
index 23e01da72eded1..bfbae58086c1fd 100644
--- a/flang/runtime/CUDA/CMakeLists.txt
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -17,6 +17,7 @@ add_flang_library(${CUFRT_LIBNAME}
   allocator.cpp
   allocatable.cpp
   descriptor.cpp
+  init.cpp
   kernel.cpp
   memmove-function.cpp
   memory.cpp
diff --git a/flang/runtime/CUDA/init.cpp b/flang/runtime/CUDA/init.cpp
new file mode 100644
index 00000000000000..2bffce842b9526
--- /dev/null
+++ b/flang/runtime/CUDA/init.cpp
@@ -0,0 +1,25 @@
+//===-- runtime/CUDA/init.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/CUDA/init.h"
+#include "../environment.h"
+#include "../terminator.h"
+#include "flang/Runtime/CUDA/common.h"
+
+#include "cuda_runtime.h"
+
+extern "C" {
+
+void RTDEF(CUFInit)() {
+  // Perform ctx initialization based on execution environment if necessary.
+  if (Fortran::runtime::executionEnvironment.cudaStackLimit) {
+    CUDA_REPORT_IF_ERROR(cudaDeviceSetLimit(cudaLimitStackSize,
+        Fortran::runtime::executionEnvironment.cudaStackLimit));
+  }
+}
+}
diff --git a/flang/runtime/environment.cpp b/flang/runtime/environment.cpp
index 52b1d99ba536ed..0f927587fb4f88 100644
--- a/flang/runtime/environment.cpp
+++ b/flang/runtime/environment.cpp
@@ -143,6 +143,17 @@ void ExecutionEnvironment::Configure(int ac, const char *av[],
     }
   }
 
+  if (auto *x{std::getenv("CUDA_STACKLIMIT")}) {
+    char *end;
+    auto n{std::strtol(x, &end, 10)};
+    if (n >= 0 && n < std::numeric_limits<int>::max() && *end == '\0') {
+      cudaStackLimit = n;
+    } else {
+      std::fprintf(stderr,
+          "Fortran runtime: CUDA_STACKLIMIT=%s is invalid; ignored\n", x);
+    }
+  }
+
   // TODO: Set RP/ROUND='PROCESSOR_DEFINED' from environment
 }
 
diff --git a/flang/runtime/environment.h b/flang/runtime/environment.h
index b8b9f10e4e57f5..184f0eb8653a65 100644
--- a/flang/runtime/environment.h
+++ b/flang/runtime/environment.h
@@ -56,6 +56,9 @@ struct ExecutionEnvironment {
   bool noStopMessage{false}; // NO_STOP_MESSAGE=1 inhibits "Fortran STOP"
   bool defaultUTF8{false}; // DEFAULT_UTF8
   bool checkPointerDeallocation{true}; // FORT_CHECK_POINTER_DEALLOCATION
+
+  // CUDA Fortran related variables
+  std::size_t cudaStackLimit{0}; // CUDA_STACKLIMIT
 };
 
 RT_OFFLOAD_VAR_GROUP_BEGIN

>From b3bcc4ec56cdee0864a962078f797a8e14583204 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 28 Jan 2025 16:09:19 -0800
Subject: [PATCH 2/2] Update comparison

---
 flang/runtime/environment.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flang/runtime/environment.cpp b/flang/runtime/environment.cpp
index 0f927587fb4f88..ee2e1e94fa5f30 100644
--- a/flang/runtime/environment.cpp
+++ b/flang/runtime/environment.cpp
@@ -145,8 +145,8 @@ void ExecutionEnvironment::Configure(int ac, const char *av[],
 
   if (auto *x{std::getenv("CUDA_STACKLIMIT")}) {
     char *end;
-    auto n{std::strtol(x, &end, 10)};
-    if (n >= 0 && n < std::numeric_limits<int>::max() && *end == '\0') {
+    auto n{std::strtoul(x, &end, 10)};
+    if (n > 0 && n < std::numeric_limits<std::size_t>::max() && *end == '\0') {
       cudaStackLimit = n;
     } else {
       std::fprintf(stderr,