[libc-commits] [libc] bc11bb3 - [libc] Add the '--threads' and '--blocks' option to the GPU loaders

Wed Apr 19 06:02:10 PDT 2023

Author: Joseph Huber
Date: 2023-04-19T08:01:58-05:00
New Revision: bc11bb3e26e98b167737cee94ca23a6fb5a40881

URL: https://github.com/llvm/llvm-project/commit/bc11bb3e26e98b167737cee94ca23a6fb5a40881
DIFF: https://github.com/llvm/llvm-project/commit/bc11bb3e26e98b167737cee94ca23a6fb5a40881.diff

LOG: [libc] Add the '--threads' and '--blocks' option to the GPU loaders

We will want to test the GPU `libc` with multiple threads in the future.
This patch adds the `--threads` and `--blocks` option to set the `x`
dimension of the kernel. Using CUDA terminology instead of OpenCL for
familiarity.

Depends on D148288 D148342

Reviewed By: jdoerfert, sivachandra, tra

Differential Revision: https://reviews.llvm.org/D148485

Added: 
    

Modified: 
    libc/utils/gpu/loader/Loader.h
    libc/utils/gpu/loader/Main.cpp
    libc/utils/gpu/loader/amdgpu/Loader.cpp
    libc/utils/gpu/loader/nvptx/Loader.cpp

Removed: 
    


################################################################################
diff  --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index 2c30593075845..9c6413ee45d8d 100644

--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -13,10 +13,21 @@
 #include <cstring>
 #include <stddef.h>
 
+/// Generic launch parameters for configuration the number of blocks / threads.
+struct LaunchParameters {
+  uint32_t num_threads_x;
+  uint32_t num_threads_y;
+  uint32_t num_threads_z;
+  uint32_t num_blocks_x;
+  uint32_t num_blocks_y;
+  uint32_t num_blocks_z;
+};
+
 /// Generic interface to load the \p image and launch execution of the _start
 /// kernel on the target device. Copies \p argc and \p argv to the device.
 /// Returns the final value of the `main` function on the device.
-int load(int argc, char **argv, char **evnp, void *image, size_t size);
+int load(int argc, char **argv, char **evnp, void *image, size_t size,
+         const LaunchParameters &params);
 
 /// Copy the system's argument vector to GPU memory allocated using \p alloc.
 template <typename Allocator>

diff  --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp
index 00354720dda93..b711ec91c9f30 100644
--- a/libc/utils/gpu/loader/Main.cpp
+++ b/libc/utils/gpu/loader/Main.cpp
@@ -15,21 +15,69 @@
 
 #include <cstdio>
 #include <cstdlib>
+#include <string>
+#include <vector>
 
 int main(int argc, char **argv, char **envp) {
   if (argc < 2) {
-    printf("USAGE: ./loader <device_image> <args>, ...\n");
+    printf("USAGE: ./loader [--threads <n>, --blocks <n>] <device_image> "
+           "<args>, ...\n");
     return EXIT_SUCCESS;
   }
 
-  // TODO: We should perform some validation on the file.
-  FILE *file = fopen(argv[1], "r");
+  int offset = 0;
+  FILE *file = nullptr;
+  char *ptr;
+  LaunchParameters params = {1, 1, 1, 1, 1, 1};
+  while (!file && ++offset < argc) {
+    if (argv[offset] == std::string("--threads") ||
+        argv[offset] == std::string("--threads-x")) {
+      params.num_threads_x =
+          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+      offset++;
+      continue;
+    } else if (argv[offset] == std::string("--threads-y")) {
+      params.num_threads_y =
+          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+      offset++;
+      continue;
+    } else if (argv[offset] == std::string("--threads-z")) {
+      params.num_threads_z =
+          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+      offset++;
+      continue;
+    } else if (argv[offset] == std::string("--blocks") ||
+               argv[offset] == std::string("--blocks-x")) {
+      params.num_blocks_x =
+          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+      offset++;
+      continue;
+    } else if (argv[offset] == std::string("--blocks-y")) {
+      params.num_blocks_y =
+          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+      offset++;
+      continue;
+    } else if (argv[offset] == std::string("--blocks-z")) {
+      params.num_blocks_z =
+          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+      offset++;
+      continue;
+    } else {
+      file = fopen(argv[offset], "r");
+      if (!file) {
+        fprintf(stderr, "Failed to open image file '%s'\n", argv[offset]);
+        return EXIT_FAILURE;
+      }
+      break;
+    }
+  }
 
   if (!file) {
-    fprintf(stderr, "Failed to open image file %s\n", argv[1]);
+    fprintf(stderr, "No image file provided\n");
     return EXIT_FAILURE;
   }
 
+  // TODO: We should perform some validation on the file.
   fseek(file, 0, SEEK_END);
   const auto size = ftell(file);
   fseek(file, 0, SEEK_SET);
@@ -39,7 +87,7 @@ int main(int argc, char **argv, char **envp) {
   fclose(file);
 
   // Drop the loader from the program arguments.
-  int ret = load(argc - 1, &argv[1], envp, image, size);
+  int ret = load(argc - offset, &argv[offset], envp, image, size, params);
 
   free(image);
   return ret;

diff  --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index 87dd3ce48d82a..54e6caf81e2d0 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -170,7 +170,8 @@ hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
   return iterate_agent_memory_pools(agent, cb);
 }
 
-int load(int argc, char **argv, char **envp, void *image, size_t size) {
+int load(int argc, char **argv, char **envp, void *image, size_t size,
+         const LaunchParameters &params) {
   // Initialize the HSA runtime used to communicate with the device.
   if (hsa_status_t err = hsa_init())
     handle_error(err);
@@ -355,13 +356,15 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
   // with one thread on the device, forcing the rest of the wavefront to be
   // masked off.
   std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
-  packet->setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
-  packet->workgroup_size_x = 1;
-  packet->workgroup_size_y = 1;
-  packet->workgroup_size_z = 1;
-  packet->grid_size_x = 1;
-  packet->grid_size_y = 1;
-  packet->grid_size_z = 1;
+  packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
+                   (params.num_blocks_z * params.num_threads_z != 1))
+                  << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+  packet->workgroup_size_x = params.num_threads_x;
+  packet->workgroup_size_y = params.num_threads_y;
+  packet->workgroup_size_z = params.num_threads_z;
+  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
+  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
+  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
   packet->private_segment_size = private_size;
   packet->group_segment_size = group_size;
   packet->kernel_object = kernel;

diff  --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index ed8b8d018c6a6..15ff11a3bd80e 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -76,7 +76,8 @@ static void handle_error(const char *msg) {
   exit(EXIT_FAILURE);
 }
 
-int load(int argc, char **argv, char **envp, void *image, size_t size) {
+int load(int argc, char **argv, char **envp, void *image, size_t size,
+         const LaunchParameters &params) {
   if (CUresult err = cuInit(0))
     handle_error(err);
 
@@ -157,10 +158,10 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
   server.reset(server_inbox, server_outbox, buffer);
 
   // Call the kernel with the given arguments.
-  if (CUresult err =
-          cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1,
-                         /*gridDimZ=*/1, /*blockDimX=*/1, /*blockDimY=*/1,
-                         /*bloackDimZ=*/1, 0, stream, nullptr, args_config))
+  if (CUresult err = cuLaunchKernel(
+          function, params.num_blocks_x, params.num_blocks_y,
+          params.num_blocks_z, params.num_threads_x, params.num_threads_y,
+          params.num_threads_z, 0, stream, nullptr, args_config))
     handle_error(err);
 
   // Wait until the kernel has completed execution on the device. Periodically