[libc-commits] [libc] [libc] Add Kernel Resource Usage to nvptx-loader (PR #97503)

via libc-commits libc-commits at lists.llvm.org
Sat Jul 6 15:52:20 PDT 2024


https://github.com/jameshu15869 updated https://github.com/llvm/llvm-project/pull/97503

>From bd53bf13630bc84492684940ef834677ba0e320e Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 26 Jun 2024 18:33:38 -0400
Subject: [PATCH 1/3] support basic resource usage using cuobjdump

---
 libc/benchmarks/gpu/CMakeLists.txt         | 17 +++++++++++++++++
 libc/cmake/modules/LLVMLibCTestRules.cmake | 10 ++++++++++
 2 files changed, 27 insertions(+)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index d167abcaf2db1c..7f006329df1985 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -22,6 +22,23 @@ function(add_benchmark benchmark_name)
     ${BENCHMARK_UNPARSED_ARGUMENTS}
   )
   get_fq_target_name(${benchmark_name} fq_target_name)
+  set(fq_build_target_name ${fq_target_name}.__build__)
+
+  set(grep_res_usage_cmd grep -A 3 "${benchmark_name}")
+  set(res_usage_cmd cuobjdump -res-usage $<TARGET_FILE:${fq_build_target_name}> | ${grep_res_usage_cmd})
+  add_custom_command(
+    OUTPUT ${fq_target_name}-cmd APPEND
+    COMMAND ${res_usage_cmd}
+    COMMAND_EXPAND_LISTS
+    COMMENT "Reading resource usage for ${benchmark_name}"
+    ${LIBC_HERMETIC_TEST_JOB_POOL}
+  )
+  # add_dependencies(${fq_target_name} ${fq_target_name}_resources)
+  # set_source_files_properties(${fq_target_name}_resources
+  #   PROPERTIES
+  #     SYMBOLIC "TRUE"
+  # )
+
   add_dependencies(gpu-benchmark ${fq_target_name})
 endfunction(add_benchmark)
 
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index fbeec32883b635..52bc2ad03a30d8 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -709,12 +709,22 @@ function(add_libc_hermetic test_name)
       $<TARGET_FILE:${fq_build_target_name}> ${HERMETIC_TEST_ARGS})
   add_custom_target(
     ${fq_target_name}
+    DEPENDS ${fq_target_name}-cmd
+  )
+
+  add_custom_command(
+    OUTPUT ${fq_target_name}-cmd
     COMMAND ${test_cmd}
     COMMAND_EXPAND_LISTS
     COMMENT "Running hermetic test ${fq_target_name}"
     ${LIBC_HERMETIC_TEST_JOB_POOL}
   )
 
+  set_source_files_properties(${fq_target_name}-cmd
+    PROPERTIES
+      SYMBOLIC "TRUE"
+  )
+
   add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name})
   if(NOT ${HERMETIC_TEST_IS_BENCHMARK})
     # If it is a benchmark, it will already have been added to the

>From 415098692de3b9489eb7e1791e7832e93dc7eaab Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Tue, 2 Jul 2024 21:02:09 -0400
Subject: [PATCH 2/3] add resource usage to nvptx-loader

---
 libc/benchmarks/gpu/CMakeLists.txt     | 18 ++----------------
 libc/utils/gpu/loader/Loader.h         |  1 +
 libc/utils/gpu/loader/Main.cpp         |  6 +++++-
 libc/utils/gpu/loader/nvptx/Loader.cpp | 18 ++++++++++++++++++
 4 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 7f006329df1985..4790e55bec4783 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -19,25 +19,11 @@ function(add_benchmark benchmark_name)
     LINK_LIBRARIES
       LibcGpuBenchmark.hermetic
       ${BENCHMARK_LINK_LIBRARIES}
+    LOADER_ARGS
+      "--print-resource-usage"
     ${BENCHMARK_UNPARSED_ARGUMENTS}
   )
   get_fq_target_name(${benchmark_name} fq_target_name)
-  set(fq_build_target_name ${fq_target_name}.__build__)
-
-  set(grep_res_usage_cmd grep -A 3 "${benchmark_name}")
-  set(res_usage_cmd cuobjdump -res-usage $<TARGET_FILE:${fq_build_target_name}> | ${grep_res_usage_cmd})
-  add_custom_command(
-    OUTPUT ${fq_target_name}-cmd APPEND
-    COMMAND ${res_usage_cmd}
-    COMMAND_EXPAND_LISTS
-    COMMENT "Reading resource usage for ${benchmark_name}"
-    ${LIBC_HERMETIC_TEST_JOB_POOL}
-  )
-  # add_dependencies(${fq_target_name} ${fq_target_name}_resources)
-  # set_source_files_properties(${fq_target_name}_resources
-  #   PROPERTIES
-  #     SYMBOLIC "TRUE"
-  # )
 
   add_dependencies(gpu-benchmark ${fq_target_name})
 endfunction(add_benchmark)
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index eae2776b2773fa..f576c58d902a1a 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -28,6 +28,7 @@ struct LaunchParameters {
   uint32_t num_blocks_x;
   uint32_t num_blocks_y;
   uint32_t num_blocks_z;
+  bool print_resource_usage;
 };
 
 /// The arguments to the '_begin' kernel.
diff --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp
index b711ec91c9f304..dfaee4d8578264 100644
--- a/libc/utils/gpu/loader/Main.cpp
+++ b/libc/utils/gpu/loader/Main.cpp
@@ -20,7 +20,8 @@
 
 int main(int argc, char **argv, char **envp) {
   if (argc < 2) {
-    printf("USAGE: ./loader [--threads <n>, --blocks <n>] <device_image> "
+    printf("USAGE: ./loader [--threads <n>, --blocks <n>, "
+           "--print-resource-usage] <device_image> "
            "<args>, ...\n");
     return EXIT_SUCCESS;
   }
@@ -62,6 +63,9 @@ int main(int argc, char **argv, char **envp) {
           offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
       offset++;
       continue;
+    } else if (argv[offset] == std::string("--print-resource-usage")) {
+      params.print_resource_usage = true;
+      continue;
     } else {
       file = fopen(argv[offset], "r");
       if (!file) {
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index 012cb778ecf153..90e52ddb008da5 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -229,6 +229,17 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
   return CUDA_SUCCESS;
 }
 
+void print_resource_usage(CUmodule binary, const char *kernel_name) {
+  CUfunction function;
+  if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
+    handle_error(err);
+  int num_regs;
+  if (CUresult err =
+          cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function))
+    handle_error(err);
+  fprintf(stderr, "%6s registers: %d\n", kernel_name, num_regs);
+}
+
 int load(int argc, char **argv, char **envp, void *image, size_t size,
          const LaunchParameters &params) {
   if (CUresult err = cuInit(0))
@@ -341,6 +352,13 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
   if (CUresult err = cuStreamSynchronize(stream))
     handle_error(err);
 
+  // Print resource usage if requested.
+  if (params.print_resource_usage) {
+    print_resource_usage(binary, "_begin");
+    print_resource_usage(binary, "_start");
+    print_resource_usage(binary, "_end");
+  }
+
   end_args_t fini_args = {host_ret};
   if (CUresult err = launch_kernel(binary, stream, rpc_device,
                                    single_threaded_params, "_end", fini_args))

>From 33d986a77ba791912fab6456c6cd7b478370ae14 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 6 Jul 2024 18:51:52 -0400
Subject: [PATCH 3/3] printing resource usage should be separate from launching
 kernels

---
 libc/benchmarks/gpu/CMakeLists.txt         | 20 ++++++++++--
 libc/cmake/modules/LLVMLibCTestRules.cmake |  4 +--
 libc/utils/gpu/loader/Loader.h             |  3 ++
 libc/utils/gpu/loader/Main.cpp             |  8 ++++-
 libc/utils/gpu/loader/amdgpu/Loader.cpp    |  5 +++
 libc/utils/gpu/loader/nvptx/Loader.cpp     | 37 ++++++++++++++++------
 6 files changed, 62 insertions(+), 15 deletions(-)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 4790e55bec4783..e6ac40cb17eaf0 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -15,15 +15,29 @@ function(add_benchmark benchmark_name)
   endif()
   add_libc_hermetic(
     ${benchmark_name}
-    IS_BENCHMARK
+    IS_GPU_BENCHMARK
     LINK_LIBRARIES
       LibcGpuBenchmark.hermetic
       ${BENCHMARK_LINK_LIBRARIES}
-    LOADER_ARGS
-      "--print-resource-usage"
     ${BENCHMARK_UNPARSED_ARGUMENTS}
   )
   get_fq_target_name(${benchmark_name} fq_target_name)
+  set(fq_build_target_name ${fq_target_name}.__build__)
+
+  # We want to dump kernel resource usage for GPU benchmarks
+  get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE")
+  set(res_usage_cmd $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}>
+    ${CMAKE_CROSSCOMPILING_EMULATOR}
+    --print-resource-usage
+    $<TARGET_FILE:${fq_build_target_name}>
+  )
+  add_custom_command(
+    OUTPUT ${fq_target_name}-cmd APPEND
+    COMMAND ${CMAKE_COMMAND} -E echo "Reading resource usage for ${benchmark_name}:"
+    COMMAND ${res_usage_cmd}
+    COMMAND_EXPAND_LISTS
+    ${LIBC_HERMETIC_TEST_JOB_POOL}
+  )
 
   add_dependencies(gpu-benchmark ${fq_target_name})
 endfunction(add_benchmark)
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 52bc2ad03a30d8..4d349cb1799da0 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -553,7 +553,7 @@ function(add_libc_hermetic test_name)
   endif()
   cmake_parse_arguments(
     "HERMETIC_TEST"
-    "IS_BENCHMARK" # Optional arguments
+    "IS_GPU_BENCHMARK" # Optional arguments
     "SUITE" # Single value arguments
     "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments
     ${ARGN}
@@ -726,7 +726,7 @@ function(add_libc_hermetic test_name)
   )
 
   add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name})
-  if(NOT ${HERMETIC_TEST_IS_BENCHMARK})
+  if(NOT ${HERMETIC_TEST_IS_GPU_BENCHMARK})
     # If it is a benchmark, it will already have been added to the
     # gpu-benchmark target
     add_dependencies(libc-hermetic-tests ${fq_target_name})
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index f576c58d902a1a..08b7b827618dea 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -51,6 +51,9 @@ struct end_args_t {
   int argc;
 };
 
+/// Print resource usage of the _begin, _start, and _end kernels in \p image.
+void print_resources(void *image);
+
 /// Generic interface to load the \p image and launch execution of the _start
 /// kernel on the target device. Copies \p argc and \p argv to the device.
 /// Returns the final value of the `main` function on the device.
diff --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp
index dfaee4d8578264..c7b98d2336b784 100644
--- a/libc/utils/gpu/loader/Main.cpp
+++ b/libc/utils/gpu/loader/Main.cpp
@@ -30,6 +30,7 @@ int main(int argc, char **argv, char **envp) {
   FILE *file = nullptr;
   char *ptr;
   LaunchParameters params = {1, 1, 1, 1, 1, 1};
+  bool print_resource_usage = false;
   while (!file && ++offset < argc) {
     if (argv[offset] == std::string("--threads") ||
         argv[offset] == std::string("--threads-x")) {
@@ -64,7 +65,7 @@ int main(int argc, char **argv, char **envp) {
       offset++;
       continue;
     } else if (argv[offset] == std::string("--print-resource-usage")) {
-      params.print_resource_usage = true;
+      print_resource_usage = true;
       continue;
     } else {
       file = fopen(argv[offset], "r");
@@ -90,6 +91,11 @@ int main(int argc, char **argv, char **envp) {
   fread(image, sizeof(char), size, file);
   fclose(file);
 
+  if (print_resource_usage) {
+    print_resources(image);
+    return 0;
+  }
+
   // Drop the loader from the program arguments.
   int ret = load(argc - offset, &argv[offset], envp, image, size, params);
 
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index f8d178be7a517d..dc3fd72a84b012 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -326,6 +326,11 @@ static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
   return HSA_STATUS_SUCCESS;
 }
 
+void print_resources(void *image) {
+  fprintf(stderr, "Printing resource usage on AMDGPU is not supported yet.\n");
+  exit(EXIT_FAILURE);
+}
+
 int load(int argc, char **argv, char **envp, void *image, size_t size,
          const LaunchParameters &params) {
   // Initialize the HSA runtime used to communicate with the device.
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index 90e52ddb008da5..c6f088c2bfb1d3 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -229,7 +229,7 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
   return CUDA_SUCCESS;
 }
 
-void print_resource_usage(CUmodule binary, const char *kernel_name) {
+void print_kernel_resources(CUmodule binary, const char *kernel_name) {
   CUfunction function;
   if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
     handle_error(err);
@@ -237,7 +237,33 @@ void print_resource_usage(CUmodule binary, const char *kernel_name) {
   if (CUresult err =
           cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function))
     handle_error(err);
-  fprintf(stderr, "%6s registers: %d\n", kernel_name, num_regs);
+  printf("%6s registers: %d\n", kernel_name, num_regs);
+}
+
+void print_resources(void *image) {
+  if (CUresult err = cuInit(0))
+    handle_error(err);
+
+  // Obtain the first device found on the system.
+  uint32_t device_id = 0;
+  CUdevice device;
+  if (CUresult err = cuDeviceGet(&device, device_id))
+    handle_error(err);
+
+  // Initialize the CUDA context and claim it.
+  CUcontext context;
+  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
+    handle_error(err);
+  if (CUresult err = cuCtxSetCurrent(context))
+    handle_error(err);
+
+  CUmodule binary;
+  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
+    handle_error(err);
+
+  print_kernel_resources(binary, "_begin");
+  print_kernel_resources(binary, "_start");
+  print_kernel_resources(binary, "_end");
 }
 
 int load(int argc, char **argv, char **envp, void *image, size_t size,
@@ -352,13 +378,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
   if (CUresult err = cuStreamSynchronize(stream))
     handle_error(err);
 
-  // Print resource usage if requested.
-  if (params.print_resource_usage) {
-    print_resource_usage(binary, "_begin");
-    print_resource_usage(binary, "_start");
-    print_resource_usage(binary, "_end");
-  }
-
   end_args_t fini_args = {host_ret};
   if (CUresult err = launch_kernel(binary, stream, rpc_device,
                                    single_threaded_params, "_end", fini_args))



More information about the libc-commits mailing list