[libc] [llvm] [LLVM] Port 'llvm-gpu-loader' to use LLVMOffload (PR #162739)

Fri Oct 10 09:10:39 PDT 2025

================
@@ -35,121 +35,255 @@
 
 using namespace llvm;
 
-static cl::OptionCategory loader_category("loader options");
+static cl::OptionCategory LoaderCategory("loader options");
 
-static cl::opt<bool> help("h", cl::desc("Alias for -help"), cl::Hidden,
-                          cl::cat(loader_category));
+static cl::opt<bool> Help("h", cl::desc("Alias for -help"), cl::Hidden,
+                          cl::cat(LoaderCategory));
 
 static cl::opt<unsigned>
-    threads_x("threads-x", cl::desc("Number of threads in the 'x' dimension"),
-              cl::init(1), cl::cat(loader_category));
+    Threads_x("threads-x", cl::desc("Number of threads in the 'x' dimension"),
+              cl::init(1), cl::cat(LoaderCategory));
 static cl::opt<unsigned>
-    threads_y("threads-y", cl::desc("Number of threads in the 'y' dimension"),
-              cl::init(1), cl::cat(loader_category));
+    Threads_y("threads-y", cl::desc("Number of threads in the 'y' dimension"),
+              cl::init(1), cl::cat(LoaderCategory));
 static cl::opt<unsigned>
-    threads_z("threads-z", cl::desc("Number of threads in the 'z' dimension"),
-              cl::init(1), cl::cat(loader_category));
-static cl::alias threads("threads", cl::aliasopt(threads_x),
+    Threads_z("threads-z", cl::desc("Number of threads in the 'z' dimension"),
+              cl::init(1), cl::cat(LoaderCategory));
+// Alias so that '--threads' is accepted as shorthand for '--threads-x'. Named
+// with a leading capital to match the other option variables in this file.
+static cl::alias Threads("threads", cl::aliasopt(Threads_x),
                          cl::desc("Alias for --threads-x"),
-                         cl::cat(loader_category));
+                         cl::cat(LoaderCategory));
 
 static cl::opt<unsigned>
-    blocks_x("blocks-x", cl::desc("Number of blocks in the 'x' dimension"),
-             cl::init(1), cl::cat(loader_category));
+    Blocks_x("blocks-x", cl::desc("Number of blocks in the 'x' dimension"),
+             cl::init(1), cl::cat(LoaderCategory));
 static cl::opt<unsigned>
-    blocks_y("blocks-y", cl::desc("Number of blocks in the 'y' dimension"),
-             cl::init(1), cl::cat(loader_category));
+    Blocks_y("blocks-y", cl::desc("Number of blocks in the 'y' dimension"),
+             cl::init(1), cl::cat(LoaderCategory));
 static cl::opt<unsigned>
-    blocks_z("blocks-z", cl::desc("Number of blocks in the 'z' dimension"),
-             cl::init(1), cl::cat(loader_category));
-static cl::alias blocks("blocks", cl::aliasopt(blocks_x),
+    Blocks_z("blocks-z", cl::desc("Number of blocks in the 'z' dimension"),
+             cl::init(1), cl::cat(LoaderCategory));
+static cl::alias Blocks("blocks", cl::aliasopt(Blocks_x),
                         cl::desc("Alias for --blocks-x"),
-                        cl::cat(loader_category));
+                        cl::cat(LoaderCategory));
 
-static cl::opt<bool>
-    print_resource_usage("print-resource-usage",
-                         cl::desc("Output resource usage of launched kernels"),
-                         cl::init(false), cl::cat(loader_category));
-
-static cl::opt<std::string> file(cl::Positional, cl::Required,
+static cl::opt<std::string> File(cl::Positional, cl::Required,
                                  cl::desc("<gpu executable>"),
-                                 cl::cat(loader_category));
-static cl::list<std::string> args(cl::ConsumeAfter,
+                                 cl::cat(LoaderCategory));
+static cl::list<std::string> Args(cl::ConsumeAfter,
                                   cl::desc("<program arguments>..."),
-                                  cl::cat(loader_category));
+                                  cl::cat(LoaderCategory));
+
+// The arguments to the '_begin' kernel.
+// NOTE(review): presumably handed to the device as a raw argument buffer (see
+// launchKernel, which passes the struct's address and sizeof), so the member
+// order and types would have to match what the device-side '_begin' entry
+// point expects -- confirm before changing the layout.
+struct BeginArgs {
+  int Argc;
+  void *Argv;
+  void *Envp;
+};
+
+// The arguments to the '_start' kernel.
+// Carries the same three leading members as BeginArgs plus 'Ret'.
+// NOTE(review): 'Ret' is presumably the location the program's return value
+// is written to, and the layout presumably has to match the device-side
+// '_start' entry point -- confirm against the GPU startup code.
+struct StartArgs {
+  int Argc;
+  void *Argv;
+  void *Envp;
+  void *Ret;
+};
 
-[[noreturn]] void report_error(Error E) {
+// The arguments to the '_end' kernel.
+// Deliberately has no members. launchKernel reports an argument size of zero
+// for empty types rather than sizeof(EndArgs).
+struct EndArgs {};
+
+/// Report \p E as a fatal error and terminate the process.
+///
+/// The error is consumed and logged to stderr through WithColor::error using
+/// the "loader" prefix, after which the process exits with EXIT_FAILURE. This
+/// function never returns.
+[[noreturn]] static void handleError(Error E) {
+  // Flush pending standard output before the diagnostic is written to stderr.
   outs().flush();
   logAllUnhandledErrors(std::move(E), WithColor::error(errs(), "loader"));
   exit(EXIT_FAILURE);
 }
 
-std::string get_main_executable(const char *name) {
-  void *ptr = (void *)(intptr_t)&get_main_executable;
-  auto cow_path = sys::fs::getMainExecutable(name, ptr);
-  return sys::path::parent_path(cow_path).str();
+/// Report a failed offload API call and terminate the process.
+///
+/// Prints "<file>:<line> <details>" to stderr, where <details> is the message
+/// carried by \p Err, then exits with EXIT_FAILURE. This function never
+/// returns.
+///
+/// \param Err  The non-success result returned by the offload API.
+/// \param Line The source line of the failing call (supplied by OFFLOAD_ERR).
+[[noreturn]] static void handleError(ol_result_t Err, unsigned Line) {
+  // Flush pending standard output first, as the llvm::Error overload does, so
+  // program output is not emitted after the diagnostic.
+  outs().flush();
+  // 'Line' is unsigned, so it has to be formatted with %u rather than %d.
+  fprintf(stderr, "%s:%u %s\n", __FILE__, Line, Err->Details);
+  exit(EXIT_FAILURE);
+}
+
+// Evaluate the offload API call X once and, if it yields a non-success
+// result, abort through handleError() with the current source line.
+//
+// The body is wrapped in do/while(0) so that the macro expands to exactly one
+// statement: it cannot capture a following 'else', and the caller's trailing
+// ';' does not produce a stray empty statement. X is parenthesised so that
+// any expression can be passed safely.
+#define OFFLOAD_ERR(X)                                                         \
+  do {                                                                         \
+    if (ol_result_t Err = (X))                                                 \
+      handleError(Err, __LINE__);                                              \
+  } while (0)
+
+/// Copy an argument vector into a single buffer obtained from the offload
+/// runtime.
+///
+/// The buffer is laid out as a table of \p Argc + 1 pointers (the last one
+/// null) followed immediately by the string data the table points into.
+///
+/// \param Argc   Number of strings in \p Argv.
+/// \param Argv   The strings to copy; each must be null terminated.
+/// \param Device Device passed to olMemAlloc for the OL_ALLOC_TYPE_HOST
+///               allocation.
+/// \returns A pointer to the start of the buffer, i.e. the pointer table.
+///          Terminates the process if the allocation fails.
+static void *copyArgumentVector(int Argc, const char **Argv,
+                                ol_device_handle_t Device) {
+  size_t ArgSize = sizeof(char *) * (Argc + 1);
+  size_t StringLen = 0;
+  for (int I = 0; I < Argc; ++I)
+    StringLen += strlen(Argv[I]) + 1;
+
+  // We allocate enough space for a null terminated array and all the strings.
+  // Start from nullptr so the check below never reads an indeterminate value
+  // if the runtime reports success without writing the output parameter.
+  void *DevArgv = nullptr;
+  OFFLOAD_ERR(
+      olMemAlloc(Device, OL_ALLOC_TYPE_HOST, ArgSize + StringLen, &DevArgv));
+  if (!DevArgv)
+    handleError(
+        createStringError("Failed to allocate memory for environment."));
+
+  // Store the strings linearly in the same memory buffer, directly after the
+  // pointer table.
+  char **Table = static_cast<char **>(DevArgv);
+  char *Cursor = static_cast<char *>(DevArgv) + ArgSize;
+  for (int I = 0; I < Argc; ++I) {
+    size_t Size = strlen(Argv[I]) + 1;
+    std::memcpy(Cursor, Argv[I], Size);
+    Table[I] = Cursor;
+    Cursor += Size;
+  }
+
+  // Ensure the vector is null terminated.
+  Table[Argc] = nullptr;
+  return DevArgv;
+}
+
+/// Copy the environment vector \p Envp through copyArgumentVector.
+///
+/// \param Envp   Environment strings, terminated by a null pointer.
+/// \param Device Device forwarded to copyArgumentVector for the allocation.
+/// \returns The buffer produced by copyArgumentVector.
+void *copyEnvironment(const char **Envp, ol_device_handle_t Device) {
+  // Count the entries up to, but not including, the terminating null pointer.
+  int Count = 0;
+  while (Envp[Count] != nullptr)
+    ++Count;
+
+  return copyArgumentVector(Count, Envp, Device);
+}
+
+/// Find a device that accepts \p Binary.
+///
+/// Walks the devices reported by olIterateDevices and selects the first one
+/// for which olIsValidBinary reports the image as valid. The callback returns
+/// false once a device has been selected, which is expected to end the
+/// iteration.
+///
+/// \param Binary The device image to match against the available devices.
+/// \returns The selected device. Terminates the process with an error if no
+///          device accepts the image or if an offload API call fails.
+ol_device_handle_t findDevice(MemoryBufferRef Binary) {
+  // Start from nullptr so that "no device matched" is detectable afterwards
+  // instead of returning an indeterminate handle.
+  ol_device_handle_t Device = nullptr;
+  std::tuple Data = std::make_tuple(&Device, &Binary);
+  OFFLOAD_ERR(olIterateDevices(
+      [](ol_device_handle_t Device, void *UserData) {
+        auto &[Output, Binary] = *reinterpret_cast<decltype(Data) *>(UserData);
+        bool IsValid = false;
+        OFFLOAD_ERR(olIsValidBinary(Device, Binary->getBufferStart(),
+                                    Binary->getBufferSize(), &IsValid));
+        if (!IsValid)
+          return true;
+
+        *Output = Device;
+        return false;
+      },
+      &Data));
+  if (!Device)
+    handleError(
+        createStringError("No valid device found for '%s'", File.c_str()));
+  return Device;
+}
+
+/// Find the device whose platform reports the host backend.
+///
+/// Walks the devices reported by olIterateDevices, queries each device's
+/// platform and that platform's backend, and selects the first device whose
+/// backend is OL_PLATFORM_BACKEND_HOST.
+///
+/// \returns The host device. Terminates the process with an error if no such
+///          device is found or if the iteration itself fails.
+ol_device_handle_t getHostDevice() {
+  // Start from nullptr so that "no host device" is detectable afterwards
+  // instead of returning an indeterminate handle.
+  ol_device_handle_t Device = nullptr;
+  OFFLOAD_ERR(olIterateDevices(
+      [](ol_device_handle_t Device, void *UserData) {
+        // A device whose platform or backend cannot be queried is skipped
+        // rather than inspected, so 'Platform' and 'Backend' are never read
+        // uninitialized. Returning true keeps the iteration going.
+        ol_platform_handle_t Platform;
+        if (olGetDeviceInfo(Device, OL_DEVICE_INFO_PLATFORM, sizeof(Platform),
+                            &Platform))
+          return true;
+        ol_platform_backend_t Backend;
+        if (olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND,
+                              sizeof(Backend), &Backend))
+          return true;
+
+        if (Backend != OL_PLATFORM_BACKEND_HOST)
+          return true;
+
+        *static_cast<ol_device_handle_t *>(UserData) = Device;
+        return false;
+      },
+      &Device));
+  if (!Device)
+    handleError(createStringError("No host device found."));
+  return Device;
+}
+
+/// Create a program from \p Binary on the first device in \p Devices that
+/// accepts it.
+///
+/// \param Binary  The raw device image.
+/// \param Devices Candidate devices, tried in order.
+/// \returns The program created by olCreateProgram. Terminates the process
+///          with an error if no candidate accepts the image or if an offload
+///          API call fails.
+ol_program_handle_t loadBinary(std::vector<char> &Binary,
+                               std::vector<ol_device_handle_t> &Devices) {
+  for (ol_device_handle_t &Candidate : Devices) {
+    bool Accepted = false;
+    OFFLOAD_ERR(olIsValidBinary(Candidate, Binary.data(), Binary.size(),
+                                &Accepted));
+    if (Accepted) {
+      ol_program_handle_t Result;
+      OFFLOAD_ERR(olCreateProgram(Candidate, Binary.data(), Binary.size(),
+                                  &Result));
+      return Result;
+    }
+  }
+
+  // None of the candidates accepted the image.
+  handleError(
+      createStringError("No valid device found for '%s'", File.c_str()));
+}
+
+/// Look up the kernel called \p Name in \p Program and launch it on \p Queue.
+///
+/// \tparam Args      Type of the argument struct handed to the kernel.
+/// \param Queue      Queue the launch is submitted to.
+/// \param Device     Device the kernel is launched on.
+/// \param Program    Program that contains the kernel symbol.
+/// \param Name       Name of the kernel symbol.
+/// \param LaunchArgs Launch dimensions passed on to olLaunchKernel.
+/// \param KernelArgs Argument struct; its address is passed as the argument
+///                   buffer.
+///
+/// Terminates the process if the symbol lookup or the launch fails.
+template <typename Args>
+void launchKernel(ol_queue_handle_t Queue, ol_device_handle_t Device,
+                  ol_program_handle_t Program, const char *Name,
+                  ol_kernel_launch_size_args_t LaunchArgs, Args KernelArgs) {
+  // sizeof() of an empty struct is still non-zero, so report a size of zero
+  // explicitly when the kernel takes no arguments.
+  constexpr size_t ArgsSize = std::is_empty_v<Args> ? 0 : sizeof(Args);
+
+  ol_symbol_handle_t Symbol;
+  OFFLOAD_ERR(olGetSymbol(Program, Name, OL_SYMBOL_KIND_KERNEL, &Symbol));
+  OFFLOAD_ERR(olLaunchKernel(Queue, Device, Symbol, &KernelArgs, ArgsSize,
+                             &LaunchArgs));
 }
 
 int main(int argc, const char **argv, const char **envp) {
   sys::PrintStackTraceOnErrorSignal(argv[0]);
-  cl::HideUnrelatedOptions(loader_category);
+  cl::HideUnrelatedOptions(LoaderCategory);
   cl::ParseCommandLineOptions(
       argc, argv,
       "A utility used to launch unit tests built for a GPU target. This is\n"
       "intended to provide an intrface simular to cross-compiling emulators\n");
 
-  if (help) {
+  if (Help) {
     cl::PrintHelpMessage();
     return EXIT_SUCCESS;
   }
 
-  ErrorOr<std::unique_ptr<MemoryBuffer>> image_or_err =
-      MemoryBuffer::getFileOrSTDIN(file);
-  if (std::error_code ec = image_or_err.getError())
-    report_error(errorCodeToError(ec));
-  MemoryBufferRef image = **image_or_err;
-
-  SmallVector<const char *> new_argv = {file.c_str()};
-  llvm::transform(args, std::back_inserter(new_argv),
-                  [](const std::string &arg) { return arg.c_str(); });
-
-  Expected<llvm::object::ELF64LEObjectFile> elf_or_err =
-      llvm::object::ELF64LEObjectFile::create(image);
-  if (!elf_or_err)
-    report_error(elf_or_err.takeError());
-
-  int ret = 1;
-  if (elf_or_err->getArch() == Triple::amdgcn) {
-#ifdef AMDHSA_SUPPORT
-    LaunchParameters params{threads_x, threads_y, threads_z,
-                            blocks_x,  blocks_y,  blocks_z};
-
-    ret = load_amdhsa(new_argv.size(), new_argv.data(), envp,
-                      const_cast<char *>(image.getBufferStart()),
-                      image.getBufferSize(), params, print_resource_usage);
-#else
-    report_error(createStringError(
-        "Unsupported architecture; %s",
-        Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin()));
-#endif
-  } else if (elf_or_err->getArch() == Triple::nvptx64) {
-#ifdef NVPTX_SUPPORT
-    LaunchParameters params{threads_x, threads_y, threads_z,
-                            blocks_x,  blocks_y,  blocks_z};
-
-    ret = load_nvptx(new_argv.size(), new_argv.data(), envp,
-                     const_cast<char *>(image.getBufferStart()),
-                     image.getBufferSize(), params, print_resource_usage);
-#else
-    report_error(createStringError(
-        "Unsupported architecture; %s",
-        Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin()));
-#endif
-  } else {
-    report_error(createStringError(
-        "Unsupported architecture; %s",
-        Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin()));
-  }
+  if (Error Err = loadLLVMOffload())
+    handleError(std::move(Err));
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> ImageOrErr =
+      MemoryBuffer::getFileOrSTDIN(File);
+  if (std::error_code EC = ImageOrErr.getError())
+    handleError(errorCodeToError(EC));
+  MemoryBufferRef Image = **ImageOrErr;
+
+  SmallVector<const char *> NewArgv = {File.c_str()};
+  llvm::transform(Args, std::back_inserter(NewArgv),
+                  [](const std::string &Arg) { return Arg.c_str(); });
+
+  OFFLOAD_ERR(olInit());
+  ol_device_handle_t Device = findDevice(Image);
+  ol_device_handle_t Host = getHostDevice();
+
+  ol_program_handle_t Program;
+  OFFLOAD_ERR(olCreateProgram(Device, Image.getBufferStart(),
+                              Image.getBufferSize(), &Program));
+
+  ol_queue_handle_t Queue;
+  OFFLOAD_ERR(olCreateQueue(Device, &Queue));
+
+  int DevArgc = static_cast<int>(NewArgv.size());
----------------
sarnex wrote:

Are we casting down to `int` here because of some limitation with using `size_t`?

https://github.com/llvm/llvm-project/pull/162739