[libc-commits] [libc] [libc] Add Kernel Resource Usage to nvptx-loader (PR #97503)
via libc-commits
libc-commits at lists.llvm.org
Sun Jul 14 18:37:20 PDT 2024
https://github.com/jameshu15869 updated https://github.com/llvm/llvm-project/pull/97503
>From 7b19e5965143a3a902931fc6a2a2a2873e8c9665 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 26 Jun 2024 18:33:38 -0400
Subject: [PATCH 1/4] support basic resource usage using cuobjdump
---
libc/benchmarks/gpu/CMakeLists.txt | 17 +++++++++++++++++
libc/cmake/modules/LLVMLibCTestRules.cmake | 10 ++++++++++
2 files changed, 27 insertions(+)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index d167abcaf2db1..7f006329df198 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -22,6 +22,23 @@ function(add_benchmark benchmark_name)
${BENCHMARK_UNPARSED_ARGUMENTS}
)
get_fq_target_name(${benchmark_name} fq_target_name)
+ set(fq_build_target_name ${fq_target_name}.__build__)
+
+ set(grep_res_usage_cmd grep -A 3 "${benchmark_name}")
+ set(res_usage_cmd cuobjdump -res-usage $<TARGET_FILE:${fq_build_target_name}> | ${grep_res_usage_cmd})
+ add_custom_command(
+ OUTPUT ${fq_target_name}-cmd APPEND
+ COMMAND ${res_usage_cmd}
+ COMMAND_EXPAND_LISTS
+ COMMENT "Reading resource usage for ${benchmark_name}"
+ ${LIBC_HERMETIC_TEST_JOB_POOL}
+ )
+ # add_dependencies(${fq_target_name} ${fq_target_name}_resources)
+ # set_source_files_properties(${fq_target_name}_resources
+ # PROPERTIES
+ # SYMBOLIC "TRUE"
+ # )
+
add_dependencies(gpu-benchmark ${fq_target_name})
endfunction(add_benchmark)
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index fbeec32883b63..52bc2ad03a30d 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -709,12 +709,22 @@ function(add_libc_hermetic test_name)
$<TARGET_FILE:${fq_build_target_name}> ${HERMETIC_TEST_ARGS})
add_custom_target(
${fq_target_name}
+ DEPENDS ${fq_target_name}-cmd
+ )
+
+ add_custom_command(
+ OUTPUT ${fq_target_name}-cmd
COMMAND ${test_cmd}
COMMAND_EXPAND_LISTS
COMMENT "Running hermetic test ${fq_target_name}"
${LIBC_HERMETIC_TEST_JOB_POOL}
)
+ set_source_files_properties(${fq_target_name}-cmd
+ PROPERTIES
+ SYMBOLIC "TRUE"
+ )
+
add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name})
if(NOT ${HERMETIC_TEST_IS_BENCHMARK})
# If it is a benchmark, it will already have been added to the
>From a49e872669bbb443941d0dd347d548be6d6f0808 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Tue, 2 Jul 2024 21:02:09 -0400
Subject: [PATCH 2/4] add resource usage to nvptx-loader
---
libc/benchmarks/gpu/CMakeLists.txt | 18 ++----------------
libc/utils/gpu/loader/Loader.h | 1 +
libc/utils/gpu/loader/Main.cpp | 6 +++++-
libc/utils/gpu/loader/nvptx/Loader.cpp | 18 ++++++++++++++++++
4 files changed, 26 insertions(+), 17 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 7f006329df198..4790e55bec478 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -19,25 +19,11 @@ function(add_benchmark benchmark_name)
LINK_LIBRARIES
LibcGpuBenchmark.hermetic
${BENCHMARK_LINK_LIBRARIES}
+ LOADER_ARGS
+ "--print-resource-usage"
${BENCHMARK_UNPARSED_ARGUMENTS}
)
get_fq_target_name(${benchmark_name} fq_target_name)
- set(fq_build_target_name ${fq_target_name}.__build__)
-
- set(grep_res_usage_cmd grep -A 3 "${benchmark_name}")
- set(res_usage_cmd cuobjdump -res-usage $<TARGET_FILE:${fq_build_target_name}> | ${grep_res_usage_cmd})
- add_custom_command(
- OUTPUT ${fq_target_name}-cmd APPEND
- COMMAND ${res_usage_cmd}
- COMMAND_EXPAND_LISTS
- COMMENT "Reading resource usage for ${benchmark_name}"
- ${LIBC_HERMETIC_TEST_JOB_POOL}
- )
- # add_dependencies(${fq_target_name} ${fq_target_name}_resources)
- # set_source_files_properties(${fq_target_name}_resources
- # PROPERTIES
- # SYMBOLIC "TRUE"
- # )
add_dependencies(gpu-benchmark ${fq_target_name})
endfunction(add_benchmark)
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index eae2776b2773f..f576c58d902a1 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -28,6 +28,7 @@ struct LaunchParameters {
uint32_t num_blocks_x;
uint32_t num_blocks_y;
uint32_t num_blocks_z;
+ bool print_resource_usage;
};
/// The arguments to the '_begin' kernel.
diff --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp
index b711ec91c9f30..dfaee4d857826 100644
--- a/libc/utils/gpu/loader/Main.cpp
+++ b/libc/utils/gpu/loader/Main.cpp
@@ -20,7 +20,8 @@
int main(int argc, char **argv, char **envp) {
if (argc < 2) {
- printf("USAGE: ./loader [--threads <n>, --blocks <n>] <device_image> "
+ printf("USAGE: ./loader [--threads <n>, --blocks <n>, "
+ "--print-resource-usage] <device_image> "
"<args>, ...\n");
return EXIT_SUCCESS;
}
@@ -62,6 +63,9 @@ int main(int argc, char **argv, char **envp) {
offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
offset++;
continue;
+ } else if (argv[offset] == std::string("--print-resource-usage")) {
+ params.print_resource_usage = true;
+ continue;
} else {
file = fopen(argv[offset], "r");
if (!file) {
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index 012cb778ecf15..90e52ddb008da 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -229,6 +229,17 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
return CUDA_SUCCESS;
}
+void print_resource_usage(CUmodule binary, const char *kernel_name) {
+ CUfunction function;
+ if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
+ handle_error(err);
+ int num_regs;
+ if (CUresult err =
+ cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function))
+ handle_error(err);
+ fprintf(stderr, "%6s registers: %d\n", kernel_name, num_regs);
+}
+
int load(int argc, char **argv, char **envp, void *image, size_t size,
const LaunchParameters ¶ms) {
if (CUresult err = cuInit(0))
@@ -341,6 +352,13 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (CUresult err = cuStreamSynchronize(stream))
handle_error(err);
+ // Print resource usage if requested.
+ if (params.print_resource_usage) {
+ print_resource_usage(binary, "_begin");
+ print_resource_usage(binary, "_start");
+ print_resource_usage(binary, "_end");
+ }
+
end_args_t fini_args = {host_ret};
if (CUresult err = launch_kernel(binary, stream, rpc_device,
single_threaded_params, "_end", fini_args))
>From 5d4c84f5f9ef8af5a0621e891feb1107c34e2264 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 6 Jul 2024 18:51:52 -0400
Subject: [PATCH 3/4] printing resource usage should be separate from launching
kernels
---
libc/benchmarks/gpu/CMakeLists.txt | 20 ++++++++++--
libc/cmake/modules/LLVMLibCTestRules.cmake | 4 +--
libc/utils/gpu/loader/Loader.h | 3 ++
libc/utils/gpu/loader/Main.cpp | 8 ++++-
libc/utils/gpu/loader/amdgpu/Loader.cpp | 5 +++
libc/utils/gpu/loader/nvptx/Loader.cpp | 37 ++++++++++++++++------
6 files changed, 62 insertions(+), 15 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 4790e55bec478..e6ac40cb17eaf 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -15,15 +15,29 @@ function(add_benchmark benchmark_name)
endif()
add_libc_hermetic(
${benchmark_name}
- IS_BENCHMARK
+ IS_GPU_BENCHMARK
LINK_LIBRARIES
LibcGpuBenchmark.hermetic
${BENCHMARK_LINK_LIBRARIES}
- LOADER_ARGS
- "--print-resource-usage"
${BENCHMARK_UNPARSED_ARGUMENTS}
)
get_fq_target_name(${benchmark_name} fq_target_name)
+ set(fq_build_target_name ${fq_target_name}.__build__)
+
+ # We want to dump kernel resource usage for GPU benchmarks
+ get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE")
+ set(res_usage_cmd $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}>
+ ${CMAKE_CROSSCOMPILING_EMULATOR}
+ --print-resource-usage
+ $<TARGET_FILE:${fq_build_target_name}>
+ )
+ add_custom_command(
+ OUTPUT ${fq_target_name}-cmd APPEND
+ COMMAND ${CMAKE_COMMAND} -E echo "Reading resource usage for ${benchmark_name}:"
+ COMMAND ${res_usage_cmd}
+ COMMAND_EXPAND_LISTS
+ ${LIBC_HERMETIC_TEST_JOB_POOL}
+ )
add_dependencies(gpu-benchmark ${fq_target_name})
endfunction(add_benchmark)
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 52bc2ad03a30d..4d349cb1799da 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -553,7 +553,7 @@ function(add_libc_hermetic test_name)
endif()
cmake_parse_arguments(
"HERMETIC_TEST"
- "IS_BENCHMARK" # Optional arguments
+ "IS_GPU_BENCHMARK" # Optional arguments
"SUITE" # Single value arguments
"SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments
${ARGN}
@@ -726,7 +726,7 @@ function(add_libc_hermetic test_name)
)
add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name})
- if(NOT ${HERMETIC_TEST_IS_BENCHMARK})
+ if(NOT ${HERMETIC_TEST_IS_GPU_BENCHMARK})
# If it is a benchmark, it will already have been added to the
# gpu-benchmark target
add_dependencies(libc-hermetic-tests ${fq_target_name})
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index f576c58d902a1..08b7b827618de 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -51,6 +51,9 @@ struct end_args_t {
int argc;
};
+/// Generic interface to print resources for all kernels in a GPU binary.
+void print_resources(void *image);
+
/// Generic interface to load the \p image and launch execution of the _start
/// kernel on the target device. Copies \p argc and \p argv to the device.
/// Returns the final value of the `main` function on the device.
diff --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp
index dfaee4d857826..c7b98d2336b78 100644
--- a/libc/utils/gpu/loader/Main.cpp
+++ b/libc/utils/gpu/loader/Main.cpp
@@ -30,6 +30,7 @@ int main(int argc, char **argv, char **envp) {
FILE *file = nullptr;
char *ptr;
LaunchParameters params = {1, 1, 1, 1, 1, 1};
+ bool print_resource_usage = false;
while (!file && ++offset < argc) {
if (argv[offset] == std::string("--threads") ||
argv[offset] == std::string("--threads-x")) {
@@ -64,7 +65,7 @@ int main(int argc, char **argv, char **envp) {
offset++;
continue;
} else if (argv[offset] == std::string("--print-resource-usage")) {
- params.print_resource_usage = true;
+ print_resource_usage = true;
continue;
} else {
file = fopen(argv[offset], "r");
@@ -90,6 +91,11 @@ int main(int argc, char **argv, char **envp) {
fread(image, sizeof(char), size, file);
fclose(file);
+ if (print_resource_usage) {
+ print_resources(image);
+ return 0;
+ }
+
// Drop the loader from the program arguments.
int ret = load(argc - offset, &argv[offset], envp, image, size, params);
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index f8d178be7a517..dc3fd72a84b01 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -326,6 +326,11 @@ static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
return HSA_STATUS_SUCCESS;
}
+void print_resources(void *image) {
+ fprintf(stderr, "Printing resource usage on AMDGPU is not supported yet.\n");
+ exit(EXIT_FAILURE);
+}
+
int load(int argc, char **argv, char **envp, void *image, size_t size,
const LaunchParameters ¶ms) {
// Initialize the HSA runtime used to communicate with the device.
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index 90e52ddb008da..c6f088c2bfb1d 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -229,7 +229,7 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
return CUDA_SUCCESS;
}
-void print_resource_usage(CUmodule binary, const char *kernel_name) {
+void print_kernel_resources(CUmodule binary, const char *kernel_name) {
CUfunction function;
if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
handle_error(err);
@@ -237,7 +237,33 @@ void print_resource_usage(CUmodule binary, const char *kernel_name) {
if (CUresult err =
cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function))
handle_error(err);
- fprintf(stderr, "%6s registers: %d\n", kernel_name, num_regs);
+ printf("%6s registers: %d\n", kernel_name, num_regs);
+}
+
+void print_resources(void *image) {
+ if (CUresult err = cuInit(0))
+ handle_error(err);
+
+ // Obtain the first device found on the system.
+ uint32_t device_id = 0;
+ CUdevice device;
+ if (CUresult err = cuDeviceGet(&device, device_id))
+ handle_error(err);
+
+ // Initialize the CUDA context and claim it.
+ CUcontext context;
+ if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
+ handle_error(err);
+ if (CUresult err = cuCtxSetCurrent(context))
+ handle_error(err);
+
+ CUmodule binary;
+ if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
+ handle_error(err);
+
+ print_kernel_resources(binary, "_begin");
+ print_kernel_resources(binary, "_start");
+ print_kernel_resources(binary, "_end");
}
int load(int argc, char **argv, char **envp, void *image, size_t size,
@@ -352,13 +378,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (CUresult err = cuStreamSynchronize(stream))
handle_error(err);
- // Print resource usage if requested.
- if (params.print_resource_usage) {
- print_resource_usage(binary, "_begin");
- print_resource_usage(binary, "_start");
- print_resource_usage(binary, "_end");
- }
-
end_args_t fini_args = {host_ret};
if (CUresult err = launch_kernel(binary, stream, rpc_device,
single_threaded_params, "_end", fini_args))
>From 34ae76a1bd603922836df0ac75f956e20cf86b99 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 14 Jul 2024 21:34:15 -0400
Subject: [PATCH 4/4] print kernel resources when launching kernels
---
libc/benchmarks/gpu/CMakeLists.txt | 17 +-----
libc/utils/gpu/loader/Loader.h | 6 +--
libc/utils/gpu/loader/Main.cpp | 8 +--
libc/utils/gpu/loader/amdgpu/Loader.cpp | 41 +++++++-------
libc/utils/gpu/loader/nvptx/Loader.cpp | 71 +++++++++----------------
5 files changed, 54 insertions(+), 89 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index e6ac40cb17eaf..394f0e75aaf38 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -19,26 +19,13 @@ function(add_benchmark benchmark_name)
LINK_LIBRARIES
LibcGpuBenchmark.hermetic
${BENCHMARK_LINK_LIBRARIES}
+ LOADER_ARGS
+ --print-resource-usage
${BENCHMARK_UNPARSED_ARGUMENTS}
)
get_fq_target_name(${benchmark_name} fq_target_name)
set(fq_build_target_name ${fq_target_name}.__build__)
- # We want to dump kernel resource usage for GPU benchmarks
- get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE")
- set(res_usage_cmd $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}>
- ${CMAKE_CROSSCOMPILING_EMULATOR}
- --print-resource-usage
- $<TARGET_FILE:${fq_build_target_name}>
- )
- add_custom_command(
- OUTPUT ${fq_target_name}-cmd APPEND
- COMMAND ${CMAKE_COMMAND} -E echo "Reading resource usage for ${benchmark_name}:"
- COMMAND ${res_usage_cmd}
- COMMAND_EXPAND_LISTS
- ${LIBC_HERMETIC_TEST_JOB_POOL}
- )
-
add_dependencies(gpu-benchmark ${fq_target_name})
endfunction(add_benchmark)
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index 08b7b827618de..e029816764427 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -28,7 +28,6 @@ struct LaunchParameters {
uint32_t num_blocks_x;
uint32_t num_blocks_y;
uint32_t num_blocks_z;
- bool print_resource_usage;
};
/// The arguments to the '_begin' kernel.
@@ -51,14 +50,11 @@ struct end_args_t {
int argc;
};
-/// Generic interface to print resources for all kernels in a GPU binary.
-void print_resources(void *image);
-
/// Generic interface to load the \p image and launch execution of the _start
/// kernel on the target device. Copies \p argc and \p argv to the device.
/// Returns the final value of the `main` function on the device.
int load(int argc, char **argv, char **evnp, void *image, size_t size,
- const LaunchParameters ¶ms);
+ const LaunchParameters ¶ms, bool print_resource_usage);
/// Return \p V aligned "upwards" according to \p Align.
template <typename V, typename A> inline V align_up(V val, A align) {
diff --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp
index c7b98d2336b78..a9c0b868725d0 100644
--- a/libc/utils/gpu/loader/Main.cpp
+++ b/libc/utils/gpu/loader/Main.cpp
@@ -91,13 +91,9 @@ int main(int argc, char **argv, char **envp) {
fread(image, sizeof(char), size, file);
fclose(file);
- if (print_resource_usage) {
- print_resources(image);
- return 0;
- }
-
// Drop the loader from the program arguments.
- int ret = load(argc - offset, &argv[offset], envp, image, size, params);
+ int ret = load(argc - offset, &argv[offset], envp, image, size, params,
+ print_resource_usage);
free(image);
return ret;
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index dc3fd72a84b01..a9ce36194d94d 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -125,6 +125,10 @@ hsa_status_t get_agent(hsa_agent_t *output_agent) {
return iterate_agents(cb);
}
+/// Placeholder for reporting the resources used by \p kernel_name. Resource
+/// queries are not implemented for AMDGPU, so this only emits a notice on
+/// stderr; \p kernel_name is accepted to keep the call shape the same as the
+/// NVPTX loader's helper and is otherwise unused.
+void print_kernel_resources(const char *kernel_name) {
+  // fprintf requires the destination stream as its first argument.
+  fprintf(stderr, "Kernel resources on AMDGPU is not supported yet.\n");
+}
+
/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
@@ -156,8 +160,9 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
hsa_amd_memory_pool_t coarsegrained_pool,
hsa_queue_t *queue, rpc_device_t device,
const LaunchParameters ¶ms,
- const char *kernel_name, args_t kernel_args) {
- // Look up the '_start' kernel in the loaded executable.
+ const char *kernel_name, args_t kernel_args,
+ bool print_resource_usage) {
+ // Look up the kernel in the loaded executable.
hsa_executable_symbol_t symbol;
if (hsa_status_t err = hsa_executable_get_symbol_by_name(
executable, kernel_name, &dev_agent, &symbol))
@@ -220,7 +225,7 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
handle_error(err);
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);
- // Initialie all the arguments (explicit and implicit) to zero, then set the
+ // Initialize all the arguments (explicit and implicit) to zero, then set the
// explicit arguments to the values created above.
std::memset(args, 0, args_size);
std::memcpy(args, &kernel_args, sizeof(args_t));
@@ -270,6 +275,9 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
handle_error(err);
+ if (print_resource_usage)
+ print_kernel_resources(kernel_name);
+
// Initialize the packet header and set the doorbell signal to begin execution
// by the HSA runtime.
uint16_t header =
@@ -326,13 +334,8 @@ static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
return HSA_STATUS_SUCCESS;
}
-void print_resources(void *image) {
- fprintf(stderr, "Printing resource usage on AMDGPU is not supported yet.\n");
- exit(EXIT_FAILURE);
-}
-
int load(int argc, char **argv, char **envp, void *image, size_t size,
- const LaunchParameters ¶ms) {
+ const LaunchParameters ¶ms, bool print_resource_usage) {
// Initialize the HSA runtime used to communicate with the device.
if (hsa_status_t err = hsa_init())
handle_error(err);
@@ -550,15 +553,16 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
begin_args_t init_args = {argc, dev_argv, dev_envp};
- if (hsa_status_t err = launch_kernel(
- dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
- device, single_threaded_params, "_begin.kd", init_args))
+ if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
+ coarsegrained_pool, queue, device,
+ single_threaded_params, "_begin.kd",
+ init_args, print_resource_usage))
handle_error(err);
start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
- if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
- coarsegrained_pool, queue, device,
- params, "_start.kd", args))
+ if (hsa_status_t err = launch_kernel(
+ dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+ device, params, "_start.kd", args, print_resource_usage))
handle_error(err);
void *host_ret;
@@ -576,9 +580,10 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
int ret = *static_cast<int *>(host_ret);
end_args_t fini_args = {ret};
- if (hsa_status_t err = launch_kernel(
- dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
- device, single_threaded_params, "_end.kd", fini_args))
+ if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
+ coarsegrained_pool, queue, device,
+ single_threaded_params, "_end.kd",
+ fini_args, print_resource_usage))
handle_error(err);
if (rpc_status_t err = rpc_server_shutdown(
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index c6f088c2bfb1d..9c3cf3ae19b41 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -152,10 +152,23 @@ Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
return dev_memory;
}
+/// Looks up \p kernel_name in the loaded module \p binary, queries its
+/// CU_FUNC_ATTRIBUTE_NUM_REGS attribute, and prints the result to stdout.
+/// Any CUDA driver error is forwarded to handle_error.
+void print_kernel_resources(CUmodule binary, const char *kernel_name) {
+  // Resolve the kernel symbol inside the module.
+  CUfunction kernel;
+  if (CUresult status = cuModuleGetFunction(&kernel, binary, kernel_name))
+    handle_error(status);
+
+  // Ask the driver for the kernel's register count.
+  int register_count;
+  if (CUresult status = cuFuncGetAttribute(
+          &register_count, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel))
+    handle_error(status);
+
+  printf("Executing kernel %s:\n", kernel_name);
+  printf("%6s registers: %d\n", kernel_name, register_count);
+}
+
template <typename args_t>
CUresult launch_kernel(CUmodule binary, CUstream stream,
rpc_device_t rpc_device, const LaunchParameters ¶ms,
- const char *kernel_name, args_t kernel_args) {
+ const char *kernel_name, args_t kernel_args,
+ bool print_resource_usage) {
// look up the '_start' kernel in the loaded module.
CUfunction function;
if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
@@ -208,6 +221,9 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
},
&memory_stream);
+ if (print_resource_usage)
+ print_kernel_resources(binary, kernel_name);
+
// Call the kernel with the given arguments.
if (CUresult err = cuLaunchKernel(
function, params.num_blocks_x, params.num_blocks_y,
@@ -229,45 +245,8 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
return CUDA_SUCCESS;
}
-void print_kernel_resources(CUmodule binary, const char *kernel_name) {
- CUfunction function;
- if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
- handle_error(err);
- int num_regs;
- if (CUresult err =
- cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function))
- handle_error(err);
- printf("%6s registers: %d\n", kernel_name, num_regs);
-}
-
-void print_resources(void *image) {
- if (CUresult err = cuInit(0))
- handle_error(err);
-
- // Obtain the first device found on the system.
- uint32_t device_id = 0;
- CUdevice device;
- if (CUresult err = cuDeviceGet(&device, device_id))
- handle_error(err);
-
- // Initialize the CUDA context and claim it.
- CUcontext context;
- if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
- handle_error(err);
- if (CUresult err = cuCtxSetCurrent(context))
- handle_error(err);
-
- CUmodule binary;
- if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
- handle_error(err);
-
- print_kernel_resources(binary, "_begin");
- print_kernel_resources(binary, "_start");
- print_kernel_resources(binary, "_end");
-}
-
int load(int argc, char **argv, char **envp, void *image, size_t size,
- const LaunchParameters ¶ms) {
+ const LaunchParameters ¶ms, bool print_resource_usage) {
if (CUresult err = cuInit(0))
handle_error(err);
// Obtain the first device found on the system.
@@ -360,14 +339,15 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
begin_args_t init_args = {argc, dev_argv, dev_envp};
- if (CUresult err = launch_kernel(binary, stream, rpc_device,
- single_threaded_params, "_begin", init_args))
+ if (CUresult err =
+ launch_kernel(binary, stream, rpc_device, single_threaded_params,
+ "_begin", init_args, print_resource_usage))
handle_error(err);
start_args_t args = {argc, dev_argv, dev_envp,
reinterpret_cast<void *>(dev_ret)};
- if (CUresult err =
- launch_kernel(binary, stream, rpc_device, params, "_start", args))
+ if (CUresult err = launch_kernel(binary, stream, rpc_device, params, "_start",
+ args, print_resource_usage))
handle_error(err);
// Copy the return value back from the kernel and wait.
@@ -379,8 +359,9 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
handle_error(err);
end_args_t fini_args = {host_ret};
- if (CUresult err = launch_kernel(binary, stream, rpc_device,
- single_threaded_params, "_end", fini_args))
+ if (CUresult err =
+ launch_kernel(binary, stream, rpc_device, single_threaded_params,
+ "_end", fini_args, print_resource_usage))
handle_error(err);
// Free the memory allocated for the device.
More information about the libc-commits
mailing list