[flang-commits] [flang] [flang][cuda] Allow to set the stack limit size (PR #124859)

Tue Jan 28 15:55:24 PST 2025

https://github.com/clementval created https://github.com/llvm/llvm-project/pull/124859

This patch adds a call to the `CUFInit` function just after `ProgramStart` when CUDA Fortran is enabled, in order to initialize the CUDA context. This allows the runtime to set up context information such as the stack limit, which can be defined through the environment variable `CUDA_STACKLIMIT=<value>`.

>From 3641398393de637b3d172bdae52ab127368745e2 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 28 Jan 2025 15:52:08 -0800
Subject: [PATCH] [flang][cuda] Allow to set the stack limit size

---
 flang/CMakeLists.txt                          | 13 +++++-----
 .../flang/Optimizer/Builder/Runtime/Main.h    |  3 ++-
 flang/include/flang/Runtime/CUDA/init.h       | 20 +++++++++++++++
 flang/lib/Lower/Bridge.cpp                    |  4 ++-
 flang/lib/Optimizer/Builder/Runtime/Main.cpp  | 15 ++++++++++-
 flang/runtime/CUDA/CMakeLists.txt             |  1 +
 flang/runtime/CUDA/init.cpp                   | 25 +++++++++++++++++++
 flang/runtime/environment.cpp                 | 11 ++++++++
 flang/runtime/environment.h                   |  3 +++
 9 files changed, 86 insertions(+), 9 deletions(-)
 create mode 100644 flang/include/flang/Runtime/CUDA/init.h
 create mode 100644 flang/runtime/CUDA/init.cpp

diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index b619553ef83021..fb7ab4759ad37e 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -471,6 +471,13 @@ if (FLANG_INCLUDE_TESTS)
   add_compile_definitions(FLANG_INCLUDE_TESTS=1)
 endif()
 
+option(FLANG_CUF_RUNTIME
+  "Compile CUDA Fortran runtime sources" OFF)
+if (FLANG_CUF_RUNTIME)
+  find_package(CUDAToolkit REQUIRED)
+  add_compile_definitions(FLANG_CUDA_SUPPORT=1)
+endif()
+
 add_subdirectory(include)
 add_subdirectory(lib)
 add_subdirectory(cmake/modules)
@@ -481,12 +488,6 @@ if (FLANG_BUILD_TOOLS)
   add_subdirectory(tools)
 endif()
 
-option(FLANG_CUF_RUNTIME
-  "Compile CUDA Fortran runtime sources" OFF)
-if (FLANG_CUF_RUNTIME)
-  find_package(CUDAToolkit REQUIRED)
-endif()
-
 add_subdirectory(runtime)
 
 if (LLVM_INCLUDE_EXAMPLES)
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Main.h b/flang/include/flang/Optimizer/Builder/Runtime/Main.h
index e4c5dc914c700b..a0586deade42aa 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Main.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Main.h
@@ -24,7 +24,8 @@ class GlobalOp;
 namespace fir::runtime {
 
 void genMain(fir::FirOpBuilder &builder, mlir::Location loc,
-             const std::vector<Fortran::lower::EnvironmentDefault> &defs);
+             const std::vector<Fortran::lower::EnvironmentDefault> &defs,
+             bool initCuda = false);
 }
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_MAIN_H
diff --git a/flang/include/flang/Runtime/CUDA/init.h b/flang/include/flang/Runtime/CUDA/init.h
new file mode 100644
index 00000000000000..24bc6838227208
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/init.h
@@ -0,0 +1,20 @@
+//===-- include/flang/Runtime/CUDA/init.h -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_INIT_H_
+#define FORTRAN_RUNTIME_CUDA_INIT_H_
+
+#include "common.h"
+#include "flang/Runtime/entry-names.h"
+
+extern "C" {
+
+void RTDECL(CUFInit)();
+}
+
+#endif // FORTRAN_RUNTIME_CUDA_INIT_H_
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index d92dc0cf9abd62..ff80826216e4f5 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -459,7 +459,9 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     if (hasMainProgram)
       createGlobalOutsideOfFunctionLowering([&]() {
         fir::runtime::genMain(*builder, toLocation(),
-                              bridge.getEnvironmentDefaults());
+                              bridge.getEnvironmentDefaults(),
+                              getFoldingContext().languageFeatures().IsEnabled(
+                                  Fortran::common::LanguageFeature::CUDA));
       });
 
     finalizeOpenACCLowering();
diff --git a/flang/lib/Optimizer/Builder/Runtime/Main.cpp b/flang/lib/Optimizer/Builder/Runtime/Main.cpp
index ab3c4ca81314ce..5156fd54020777 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Main.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Main.cpp
@@ -16,13 +16,17 @@
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Runtime/main.h"
 #include "flang/Runtime/stop.h"
+#ifdef FLANG_CUDA_SUPPORT
+#include "flang/Runtime/CUDA/init.h"
+#endif
 
 using namespace Fortran::runtime;
 
 /// Create a `int main(...)` that calls the Fortran entry point
 void fir::runtime::genMain(
     fir::FirOpBuilder &builder, mlir::Location loc,
-    const std::vector<Fortran::lower::EnvironmentDefault> &defs) {
+    const std::vector<Fortran::lower::EnvironmentDefault> &defs,
+    bool initCuda) {
   auto *context = builder.getContext();
   auto argcTy = builder.getDefaultIntegerType();
   auto ptrTy = mlir::LLVM::LLVMPointerType::get(context);
@@ -61,6 +65,15 @@ void fir::runtime::genMain(
   args.push_back(env);
 
   builder.create<fir::CallOp>(loc, startFn, args);
+
+#ifdef FLANG_CUDA_SUPPORT
+  if (initCuda) {
+    auto initFn = builder.createFunction(
+        loc, RTNAME_STRING(CUFInit), mlir::FunctionType::get(context, {}, {}));
+    builder.create<fir::CallOp>(loc, initFn);
+  }
+#endif
+
   builder.create<fir::CallOp>(loc, qqMainFn);
   builder.create<fir::CallOp>(loc, stopFn);
 
diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
index 23e01da72eded1..bfbae58086c1fd 100644
--- a/flang/runtime/CUDA/CMakeLists.txt
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -17,6 +17,7 @@ add_flang_library(${CUFRT_LIBNAME}
   allocator.cpp
   allocatable.cpp
   descriptor.cpp
+  init.cpp
   kernel.cpp
   memmove-function.cpp
   memory.cpp
diff --git a/flang/runtime/CUDA/init.cpp b/flang/runtime/CUDA/init.cpp
new file mode 100644
index 00000000000000..2bffce842b9526
--- /dev/null
+++ b/flang/runtime/CUDA/init.cpp
@@ -0,0 +1,25 @@
+//===-- runtime/CUDA/init.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/CUDA/init.h"
+#include "../environment.h"
+#include "../terminator.h"
+#include "flang/Runtime/CUDA/common.h"
+
+#include "cuda_runtime.h"
+
+extern "C" {
+
+void RTDEF(CUFInit)() {
+  // Perform ctx initialization based on execution environment if necessary.
+  if (Fortran::runtime::executionEnvironment.cudaStackLimit) {
+    CUDA_REPORT_IF_ERROR(cudaDeviceSetLimit(cudaLimitStackSize,
+        Fortran::runtime::executionEnvironment.cudaStackLimit));
+  }
+}
+}
diff --git a/flang/runtime/environment.cpp b/flang/runtime/environment.cpp
index 52b1d99ba536ed..0f927587fb4f88 100644
--- a/flang/runtime/environment.cpp
+++ b/flang/runtime/environment.cpp
@@ -143,6 +143,17 @@ void ExecutionEnvironment::Configure(int ac, const char *av[],
     }
   }
 
+  if (auto *x{std::getenv("CUDA_STACKLIMIT")}) {
+    char *end;
+    auto n{std::strtol(x, &end, 10)};
+    if (n >= 0 && n < std::numeric_limits<int>::max() && *end == '\0') {
+      cudaStackLimit = n;
+    } else {
+      std::fprintf(stderr,
+          "Fortran runtime: CUDA_STACKLIMIT=%s is invalid; ignored\n", x);
+    }
+  }
+
   // TODO: Set RP/ROUND='PROCESSOR_DEFINED' from environment
 }
 
diff --git a/flang/runtime/environment.h b/flang/runtime/environment.h
index b8b9f10e4e57f5..184f0eb8653a65 100644
--- a/flang/runtime/environment.h
+++ b/flang/runtime/environment.h
@@ -56,6 +56,9 @@ struct ExecutionEnvironment {
   bool noStopMessage{false}; // NO_STOP_MESSAGE=1 inhibits "Fortran STOP"
   bool defaultUTF8{false}; // DEFAULT_UTF8
   bool checkPointerDeallocation{true}; // FORT_CHECK_POINTER_DEALLOCATION
+
+  // CUDA Fortran related variables
+  std::size_t cudaStackLimit{0}; // CUDA_STACKLIMIT
 };
 
 RT_OFFLOAD_VAR_GROUP_BEGIN