[llvm] [Offload] Use flat array for cuLaunchKernel (PR #95116)
Johannes Doerfert via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 12 07:10:07 PDT 2024
https://github.com/jdoerfert updated https://github.com/llvm/llvm-project/pull/95116
From 770b5b6e59d2c046a5af9367afc4bc9e7fafbcba Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Tue, 11 Jun 2024 01:49:34 -0700
Subject: [PATCH] [Offload] Use flat array for cuLaunchKernel
We already used a flat array of kernel launch parameters for the AMD GPU
launch; now we use the same scheme for the NVIDIA GPU launch as well. The
only remaining (and required) use of the pointer indirection is in the host
plugin (due to libffi). This allows us to simplify the code for non-OpenMP
kernel launches.
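As a side note for reviewers unfamiliar with this launch path: instead of the
usual kernelParams array of per-argument pointers, the CUDA driver API also
accepts one contiguous argument buffer through the "extra" parameter of
cuLaunchKernel. The sketch below only illustrates that mechanism; the helper
launchWithFlatArgs and its parameters are hypothetical and not part of this
change.

  #include <cstddef>
  #include <cuda.h>

  // Minimal sketch (hypothetical helper): launch a kernel whose arguments
  // were already packed into one contiguous buffer, mirroring the
  // KernelLaunchParamsTy{Size, Data} layout introduced by this patch.
  static CUresult launchWithFlatArgs(CUfunction Func, CUstream Stream,
                                     void *FlatArgs, size_t FlatArgsSize,
                                     unsigned NumBlocks, unsigned NumThreads) {
    // The driver reads FlatArgsSize bytes from FlatArgs and uses them as the
    // kernel's parameter space, so no per-argument pointer array is needed.
    void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, FlatArgs,
                      CU_LAUNCH_PARAM_BUFFER_SIZE,    &FlatArgsSize,
                      CU_LAUNCH_PARAM_END};
    return cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1, /*gridDimZ=*/1,
                          NumThreads, /*blockDimY=*/1, /*blockDimZ=*/1,
                          /*sharedMemBytes=*/0, Stream,
                          /*kernelParams=*/nullptr, Config);
  }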
---
offload/include/Shared/APITypes.h | 10 +++++++
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 28 ++++++++++---------
.../common/include/PluginInterface.h | 14 ++++++----
.../common/src/PluginInterface.cpp | 28 +++++++++----------
.../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 4 +++
offload/plugins-nextgen/cuda/src/rtl.cpp | 28 ++++++++++++-------
offload/plugins-nextgen/host/src/rtl.cpp | 5 ++--
7 files changed, 72 insertions(+), 45 deletions(-)
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index a84b685eeedce..5b22bbaac144f 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -116,6 +116,16 @@ static_assert(sizeof(KernelArgsTy) ==
(8 * sizeof(int32_t) + 3 * sizeof(int64_t) +
4 * sizeof(void **) + 2 * sizeof(int64_t *)),
"Invalid struct size");
+
+/// Flat array of kernel launch parameters and their total size.
+struct KernelLaunchParamsTy {
+ /// Size of the Data array.
+ size_t Size = 0;
+ /// Flat array of kernel parameters.
+ void *Data = nullptr;
+ /// Ptrs to the Data entries. Only strictly required for the host plugin.
+ void **Ptrs = nullptr;
+};
}
#endif // OMPTARGET_SHARED_API_TYPES_H
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 26bca4a3674bd..e678213df18ce 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -20,6 +20,7 @@
#include <unistd.h>
#include <unordered_map>
+#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
#include "Shared/Utils.h"
@@ -558,7 +559,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
- uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
+ uint64_t NumBlocks, KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Print more elaborate kernel launch info for AMDGPU
@@ -2802,9 +2804,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
KernelArgsTy KernelArgs = {};
- if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
- /*NumBlocks=*/1ul, KernelArgs,
- /*Args=*/nullptr, AsyncInfoWrapper))
+ if (auto Err =
+ AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
+ /*NumBlocks=*/1ul, KernelArgs,
+ KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
Error Err = Plugin::success();
@@ -3265,12 +3268,11 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads, uint64_t NumBlocks,
- KernelArgsTy &KernelArgs, void *Args,
+ KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
- const uint32_t LaunchParamsSize = KernelArgs.NumArgs * sizeof(void *);
-
- if (ArgsSize != LaunchParamsSize &&
- ArgsSize != LaunchParamsSize + getImplicitArgsSize())
+ if (ArgsSize != LaunchParams.Size &&
+ ArgsSize != LaunchParams.Size + getImplicitArgsSize())
return Plugin::error("Mismatch of kernel arguments size");
AMDGPUPluginTy &AMDGPUPlugin =
@@ -3294,10 +3296,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
return Err;
utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
- if (ArgsSize == LaunchParamsSize + getImplicitArgsSize()) {
+ if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
// Initialize implicit arguments.
ImplArgs = reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
- advanceVoidPtr(AllArgs, LaunchParamsSize));
+ advanceVoidPtr(AllArgs, LaunchParams.Size));
// Initialize the implicit arguments to zero.
std::memset(ImplArgs, 0, getImplicitArgsSize());
@@ -3306,8 +3308,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Copy the explicit arguments.
// TODO: We should expose the args memory manager alloc to the common part as
// alternative to copying them twice.
- if (LaunchParamsSize)
- std::memcpy(AllArgs, *static_cast<void **>(Args), LaunchParamsSize);
+ if (LaunchParams.Size)
+ std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);
AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 88423be039af7..0d2a36a42d5fa 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -19,6 +19,7 @@
#include <shared_mutex>
#include <vector>
+#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
#include "Shared/EnvironmentVar.h"
@@ -265,7 +266,7 @@ struct GenericKernelTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
- void *Args,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
/// Get the kernel name.
@@ -326,11 +327,12 @@ struct GenericKernelTy {
private:
/// Prepare the arguments before launching the kernel.
- void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
- ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
- llvm::SmallVectorImpl<void *> &Args,
- llvm::SmallVectorImpl<void *> &Ptrs,
- KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
+ KernelLaunchParamsTy
+ prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
+ ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
+ llvm::SmallVectorImpl<void *> &Args,
+ llvm::SmallVectorImpl<void *> &Ptrs,
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
/// Get the number of threads and blocks for the kernel based on the
/// user-defined threads and block clauses.
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 5a53c479e33d0..94f9d4670b672 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -268,9 +268,9 @@ struct RecordReplayTy {
OS.close();
}
- void saveKernelDescr(const char *Name, void **ArgPtrs, int32_t NumArgs,
- uint64_t NumTeamsClause, uint32_t ThreadLimitClause,
- uint64_t LoopTripCount) {
+ void saveKernelDescr(const char *Name, KernelLaunchParamsTy LaunchParams,
+ int32_t NumArgs, uint64_t NumTeamsClause,
+ uint32_t ThreadLimitClause, uint64_t LoopTripCount) {
json::Object JsonKernelInfo;
JsonKernelInfo["Name"] = Name;
JsonKernelInfo["NumArgs"] = NumArgs;
@@ -283,7 +283,7 @@ struct RecordReplayTy {
json::Array JsonArgPtrs;
for (int I = 0; I < NumArgs; ++I)
- JsonArgPtrs.push_back((intptr_t)ArgPtrs[I]);
+ JsonArgPtrs.push_back((intptr_t)LaunchParams.Ptrs[I]);
JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));
json::Array JsonArgOffsets;
@@ -549,7 +549,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
if (!KernelLaunchEnvOrErr)
return KernelLaunchEnvOrErr.takeError();
- void *KernelArgsPtr =
+ KernelLaunchParamsTy LaunchParams =
prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args,
Ptrs, *KernelLaunchEnvOrErr);
@@ -564,7 +564,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
if (RecordReplay.isRecording()) {
RecordReplay.saveImage(getName(), getImage());
RecordReplay.saveKernelInput(getName(), getImage());
- RecordReplay.saveKernelDescr(getName(), Ptrs.data(), KernelArgs.NumArgs,
+ RecordReplay.saveKernelDescr(getName(), LaunchParams, KernelArgs.NumArgs,
NumBlocks, NumThreads, KernelArgs.Tripcount);
}
@@ -573,10 +573,10 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
return Err;
return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
- KernelArgsPtr, AsyncInfoWrapper);
+ LaunchParams, AsyncInfoWrapper);
}
-void *GenericKernelTy::prepareArgs(
+KernelLaunchParamsTy GenericKernelTy::prepareArgs(
GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets,
uint32_t &NumArgs, llvm::SmallVectorImpl<void *> &Args,
llvm::SmallVectorImpl<void *> &Ptrs,
@@ -585,22 +585,22 @@ void *GenericKernelTy::prepareArgs(
NumArgs += KLEOffset;
if (NumArgs == 0)
- return nullptr;
+ return KernelLaunchParamsTy{};
Args.resize(NumArgs);
Ptrs.resize(NumArgs);
if (KernelLaunchEnvironment) {
- Ptrs[0] = KernelLaunchEnvironment;
- Args[0] = &Ptrs[0];
+ Args[0] = KernelLaunchEnvironment;
+ Ptrs[0] = &Args[0];
}
for (uint32_t I = KLEOffset; I < NumArgs; ++I) {
- Ptrs[I] =
+ Args[I] =
(void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
- Args[I] = &Ptrs[I];
+ Ptrs[I] = &Args[I];
}
- return &Args[0];
+ return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
}
uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index d65e5cf61e096..16c8f7ad46c44 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -282,6 +282,10 @@ typedef enum CUevent_flags_enum {
CU_EVENT_INTERPROCESS = 0x4
} CUevent_flags;
+static inline void *CU_LAUNCH_PARAM_END = (void *)0x00;
+static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
+static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
+
CUresult cuCtxGetDevice(CUdevice *);
CUresult cuDeviceGet(CUdevice *, int);
CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 62460c07415be..b6465d61bd033 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -16,6 +16,7 @@
#include <string>
#include <unordered_map>
+#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
@@ -149,7 +150,8 @@ struct CUDAKernelTy : public GenericKernelTy {
/// Launch the CUDA kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
- uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
+ uint64_t NumBlocks, KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
private:
@@ -1228,9 +1230,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
KernelArgsTy KernelArgs = {};
- if (auto Err = CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
- /*NumBlocks=*/1ul, KernelArgs, nullptr,
- AsyncInfoWrapper))
+ if (auto Err =
+ CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
+ /*NumBlocks=*/1ul, KernelArgs,
+ KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
Error Err = Plugin::success();
@@ -1274,7 +1277,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads, uint64_t NumBlocks,
- KernelArgsTy &KernelArgs, void *Args,
+ KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
@@ -1285,11 +1289,15 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t MaxDynCGroupMem =
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
- CUresult Res =
- cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
- /*gridDimZ=*/1, NumThreads,
- /*blockDimY=*/1, /*blockDimZ=*/1, MaxDynCGroupMem, Stream,
- (void **)Args, nullptr);
+ void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
+ CU_LAUNCH_PARAM_BUFFER_SIZE,
+ reinterpret_cast<void *>(&LaunchParams.Size),
+ CU_LAUNCH_PARAM_END};
+
+ CUresult Res = cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
+ /*gridDimZ=*/1, NumThreads,
+ /*blockDimY=*/1, /*blockDimZ=*/1,
+ MaxDynCGroupMem, Stream, nullptr, Config);
return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
}
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index aa59ea618e399..fe296b77c7d55 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -90,7 +90,8 @@ struct GenELF64KernelTy : public GenericKernelTy {
/// Launch the kernel using the libffi.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
- uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
+ uint64_t NumBlocks, KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
// Create a vector of ffi_types, one per argument.
SmallVector<ffi_type *, 16> ArgTypes(KernelArgs.NumArgs, &ffi_type_pointer);
@@ -105,7 +106,7 @@ struct GenELF64KernelTy : public GenericKernelTy {
// Call the kernel function through libffi.
long Return;
- ffi_call(&Cif, Func, &Return, (void **)Args);
+ ffi_call(&Cif, Func, &Return, (void **)LaunchParams.Ptrs);
return Plugin::success();
}