[llvm] [Offload] Use flat array for cuLaunchKernel (PR #95116)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 11 06:56:31 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-offload
Author: Johannes Doerfert (jdoerfert)
<details>
<summary>Changes</summary>
We already used a flat array of kernel launch parameters for the AMD GPU launch, and now we also use this scheme for the NVIDIA GPU launch. The only remaining/required use of the indirection is the host plugin (due to ffi). This allows us to simplify the use for non-OpenMP kernel launches.
---
Full diff: https://github.com/llvm/llvm-project/pull/95116.diff
6 Files Affected:
- (modified) offload/include/Shared/APITypes.h (+10)
- (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+14-12)
- (modified) offload/plugins-nextgen/common/include/PluginInterface.h (+8-6)
- (modified) offload/plugins-nextgen/common/src/PluginInterface.cpp (+14-14)
- (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+19-10)
- (modified) offload/plugins-nextgen/host/src/rtl.cpp (+3-2)
``````````diff
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index e8fc27785b6c2..f3948a32ada8b 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -112,6 +112,16 @@ static_assert(sizeof(KernelArgsTy) ==
(8 * sizeof(int32_t) + 3 * sizeof(int64_t) +
4 * sizeof(void **) + 2 * sizeof(int64_t *)),
"Invalid struct size");
+
+/// Flat array of kernel launch parameters and their total size.
+struct KernelLaunchParamsTy {
+ /// Size of the Data array.
+ size_t Size = 0;
+ /// Flat array of kernel parameters.
+ void *Data = nullptr;
+ /// Ptrs to the Data entries. Only strictly required for the host plugin.
+ void **Ptrs = nullptr;
+};
}
#endif // OMPTARGET_SHARED_API_TYPES_H
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index c6dd954746e4a..43e0bbd85a9d3 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -20,6 +20,7 @@
#include <unistd.h>
#include <unordered_map>
+#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
#include "Shared/Utils.h"
@@ -558,7 +559,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
- uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
+ uint64_t NumBlocks, KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Print more elaborate kernel launch info for AMDGPU
@@ -2802,9 +2804,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
KernelArgsTy KernelArgs = {};
- if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
- /*NumBlocks=*/1ul, KernelArgs,
- /*Args=*/nullptr, AsyncInfoWrapper))
+ if (auto Err =
+ AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
+ /*NumBlocks=*/1ul, KernelArgs,
+ KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
Error Err = Plugin::success();
@@ -3266,18 +3269,18 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads, uint64_t NumBlocks,
- KernelArgsTy &KernelArgs, void *Args,
+ KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
- const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *);
- if (ArgsSize < KernelArgsSize)
+ if (ArgsSize < LaunchParams.Size)
return Plugin::error("Mismatch of kernel arguments size");
// The args size reported by HSA may or may not contain the implicit args.
// For now, assume that HSA does not consider the implicit arguments when
// reporting the arguments of a kernel. In the worst case, we can waste
// 56 bytes per allocation.
- uint32_t AllArgsSize = KernelArgsSize + ImplicitArgsSize;
+ uint32_t AllArgsSize = LaunchParams.Size + ImplicitArgsSize;
AMDGPUPluginTy &AMDGPUPlugin =
static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
@@ -3302,7 +3305,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Initialize implicit arguments.
utils::AMDGPUImplicitArgsTy *ImplArgs =
reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
- advanceVoidPtr(AllArgs, KernelArgsSize));
+ advanceVoidPtr(AllArgs, LaunchParams.Size));
// Initialize the implicit arguments to zero.
std::memset(ImplArgs, 0, ImplicitArgsSize);
@@ -3310,9 +3313,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Copy the explicit arguments.
// TODO: We should expose the args memory manager alloc to the common part as
// alternative to copying them twice.
- if (KernelArgs.NumArgs)
- std::memcpy(AllArgs, *static_cast<void **>(Args),
- sizeof(void *) * KernelArgs.NumArgs);
+ if (LaunchParams.Size)
+ std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);
AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index eda6a4fd541e9..37d16ae3a7027 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -19,6 +19,7 @@
#include <shared_mutex>
#include <vector>
+#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
#include "Shared/EnvironmentVar.h"
@@ -265,7 +266,7 @@ struct GenericKernelTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
- void *Args,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
/// Get the kernel name.
@@ -326,11 +327,12 @@ struct GenericKernelTy {
private:
/// Prepare the arguments before launching the kernel.
- void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
- ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
- llvm::SmallVectorImpl<void *> &Args,
- llvm::SmallVectorImpl<void *> &Ptrs,
- KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
+ KernelLaunchParamsTy
+ prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
+ ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
+ llvm::SmallVectorImpl<void *> &Args,
+ llvm::SmallVectorImpl<void *> &Ptrs,
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
/// Get the number of threads and blocks for the kernel based on the
/// user-defined threads and block clauses.
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 913721a15d713..00e12aecf7512 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -268,9 +268,9 @@ struct RecordReplayTy {
OS.close();
}
- void saveKernelDescr(const char *Name, void **ArgPtrs, int32_t NumArgs,
- uint64_t NumTeamsClause, uint32_t ThreadLimitClause,
- uint64_t LoopTripCount) {
+ void saveKernelDescr(const char *Name, KernelLaunchParamsTy LaunchParams,
+ int32_t NumArgs, uint64_t NumTeamsClause,
+ uint32_t ThreadLimitClause, uint64_t LoopTripCount) {
json::Object JsonKernelInfo;
JsonKernelInfo["Name"] = Name;
JsonKernelInfo["NumArgs"] = NumArgs;
@@ -283,7 +283,7 @@ struct RecordReplayTy {
json::Array JsonArgPtrs;
for (int I = 0; I < NumArgs; ++I)
- JsonArgPtrs.push_back((intptr_t)ArgPtrs[I]);
+ JsonArgPtrs.push_back((intptr_t)LaunchParams.Ptrs[I]);
JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));
json::Array JsonArgOffsets;
@@ -549,7 +549,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
if (!KernelLaunchEnvOrErr)
return KernelLaunchEnvOrErr.takeError();
- void *KernelArgsPtr =
+ KernelLaunchParamsTy LaunchParams =
prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args,
Ptrs, *KernelLaunchEnvOrErr);
@@ -564,7 +564,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
if (RecordReplay.isRecording()) {
RecordReplay.saveImage(getName(), getImage());
RecordReplay.saveKernelInput(getName(), getImage());
- RecordReplay.saveKernelDescr(getName(), Ptrs.data(), KernelArgs.NumArgs,
+ RecordReplay.saveKernelDescr(getName(), LaunchParams, KernelArgs.NumArgs,
NumBlocks, NumThreads, KernelArgs.Tripcount);
}
@@ -573,10 +573,10 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
return Err;
return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
- KernelArgsPtr, AsyncInfoWrapper);
+ LaunchParams, AsyncInfoWrapper);
}
-void *GenericKernelTy::prepareArgs(
+KernelLaunchParamsTy GenericKernelTy::prepareArgs(
GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets,
uint32_t &NumArgs, llvm::SmallVectorImpl<void *> &Args,
llvm::SmallVectorImpl<void *> &Ptrs,
@@ -585,22 +585,22 @@ void *GenericKernelTy::prepareArgs(
NumArgs += KLEOffset;
if (NumArgs == 0)
- return nullptr;
+ return KernelLaunchParamsTy{};
Args.resize(NumArgs);
Ptrs.resize(NumArgs);
if (KernelLaunchEnvironment) {
- Ptrs[0] = KernelLaunchEnvironment;
- Args[0] = &Ptrs[0];
+ Args[0] = KernelLaunchEnvironment;
+ Ptrs[0] = &Args[0];
}
for (uint32_t I = KLEOffset; I < NumArgs; ++I) {
- Ptrs[I] =
+ Args[I] =
(void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
- Args[I] = &Ptrs[I];
+ Ptrs[I] = &Args[I];
}
- return &Args[0];
+ return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
}
uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index b260334baa18b..a8d8846791745 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -16,6 +16,7 @@
#include <string>
#include <unordered_map>
+#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
@@ -149,7 +150,8 @@ struct CUDAKernelTy : public GenericKernelTy {
/// Launch the CUDA kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
- uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
+ uint64_t NumBlocks, KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
private:
@@ -1228,9 +1230,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
KernelArgsTy KernelArgs = {};
- if (auto Err = CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
- /*NumBlocks=*/1ul, KernelArgs, nullptr,
- AsyncInfoWrapper))
+ if (auto Err =
+ CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
+ /*NumBlocks=*/1ul, KernelArgs,
+ KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
Error Err = Plugin::success();
@@ -1274,7 +1277,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads, uint64_t NumBlocks,
- KernelArgsTy &KernelArgs, void *Args,
+ KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
@@ -1285,11 +1289,16 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t MaxDynCGroupMem =
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
- CUresult Res =
- cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
- /*gridDimZ=*/1, NumThreads,
- /*blockDimY=*/1, /*blockDimZ=*/1, MaxDynCGroupMem, Stream,
- (void **)Args, nullptr);
+ void *Config[] = {/* CU_LAUNCH_PARAM_BUFFER_POINTER */ (void *)0x01,
+ LaunchParams.Data,
+ /* CU_LAUNCH_PARAM_BUFFER_SIZE */ (void *)0x02,
+ reinterpret_cast<void *>(&LaunchParams.Size),
+ /* CU_LAUNCH_PARAM_END */ (void *)0x00};
+
+ CUresult Res = cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
+ /*gridDimZ=*/1, NumThreads,
+ /*blockDimY=*/1, /*blockDimZ=*/1,
+ MaxDynCGroupMem, Stream, nullptr, Config);
return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
}
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index ef84cbaf54588..ef2488e42c15b 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -90,7 +90,8 @@ struct GenELF64KernelTy : public GenericKernelTy {
/// Launch the kernel using the libffi.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
- uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
+ uint64_t NumBlocks, KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
// Create a vector of ffi_types, one per argument.
SmallVector<ffi_type *, 16> ArgTypes(KernelArgs.NumArgs, &ffi_type_pointer);
@@ -105,7 +106,7 @@ struct GenELF64KernelTy : public GenericKernelTy {
// Call the kernel function through libffi.
long Return;
- ffi_call(&Cif, Func, &Return, (void **)Args);
+ ffi_call(&Cif, Func, &Return, (void **)LaunchParams.Ptrs);
return Plugin::success();
}
``````````
</details>
https://github.com/llvm/llvm-project/pull/95116
More information about the llvm-commits
mailing list