[Openmp-commits] [openmp] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)
Johannes Doerfert via Openmp-commits
openmp-commits at lists.llvm.org
Sun Oct 29 11:25:59 PDT 2023
https://github.com/jdoerfert updated https://github.com/llvm/llvm-project/pull/70383
>From 09f82bed396205b97bb8854ffbc655c1c5fe2a6d Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Wed, 25 Oct 2023 16:46:01 -0700
Subject: [PATCH] [OpenMP] Associate the KernelEnvironment with the
GenericKernelTy
By associating the kernel environment with the generic kernel we can
access middle-end information easily, including the launch bounds ranges
that are acceptable. By constraining the number of threads accordingly,
we now obey the user provided bounds that were passed via attributes.
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 4 +-
.../plugins-nextgen/amdgpu/src/rtl.cpp | 8 +-
.../PluginInterface/PluginInterface.cpp | 74 +++++++------------
.../common/PluginInterface/PluginInterface.h | 39 +++++-----
.../plugins-nextgen/cuda/src/rtl.cpp | 8 +-
.../generic-elf-64bit/src/rtl.cpp | 20 ++---
.../test/offloading/default_thread_limit.c | 3 +-
.../test/offloading/thread_state_1.c | 4 +-
.../test/offloading/thread_state_2.c | 4 +-
9 files changed, 67 insertions(+), 97 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 3e4e030f44c7fe0..b320d77652e1cba 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4093,8 +4093,8 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
Function *Kernel = Builder.GetInsertBlock()->getParent();
- /// Manifest the launch configuration in the metadata matching the kernel
- /// environment.
+ // Manifest the launch configuration in the metadata matching the kernel
+ // environment.
if (MinTeamsVal > 1 || MaxTeamsVal > 0)
writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal);
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 756c5003b0d542c..5366fad0c862e7d 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -411,8 +411,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
/// generic kernel class.
struct AMDGPUKernelTy : public GenericKernelTy {
/// Create an AMDGPU kernel with a name and an execution mode.
- AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
- : GenericKernelTy(Name, ExecutionMode) {}
+ AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {}
/// Initialize the AMDGPU kernel.
Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -1978,14 +1977,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// Allocate and construct an AMDGPU kernel.
Expected<GenericKernelTy &>
- constructKernel(const __tgt_offload_entry &KernelEntry,
- OMPTgtExecModeFlags ExecMode) override {
+ constructKernel(const __tgt_offload_entry &KernelEntry) override {
// Allocate and construct the AMDGPU kernel.
AMDGPUKernelTy *AMDGPUKernel = Plugin::get().allocate<AMDGPUKernelTy>();
if (!AMDGPUKernel)
return Plugin::error("Failed to allocate memory for AMDGPU kernel");
- new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name, ExecMode);
+ new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name);
return *AMDGPUKernel;
}
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 0243f0205dbf0e5..e5ee3840a676886 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -339,9 +339,33 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
ImagePtr = &Image;
- PreferredNumThreads = GenericDevice.getDefaultNumThreads();
+ // Retrieve kernel environment object for the kernel.
+ GlobalTy KernelEnv(std::string(Name) + "_kernel_environment",
+ sizeof(KernelEnvironment), &KernelEnvironment);
+ GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler();
+ if (auto Err =
+ GHandler.readGlobalFromImage(GenericDevice, *ImagePtr, KernelEnv)) {
+ [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
+ DP("Failed to read kernel environment for '%s': %s\n"
+ "Using default SPMD (2) execution mode\n",
+ Name, ErrStr.data());
+ KernelEnvironment.Configuration.ExecMode = OMP_TGT_EXEC_MODE_SPMD;
+ KernelEnvironment.Configuration.MayUseNestedParallelism = /*Unknown=*/2;
+ KernelEnvironment.Configuration.UseGenericStateMachine = /*Unknown=*/2;
+ }
- MaxNumThreads = GenericDevice.getThreadLimit();
+ // Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max;
+ MaxNumThreads = KernelEnvironment.Configuration.MaxThreads > 0
+ ? std::min(KernelEnvironment.Configuration.MaxThreads,
+ int32_t(GenericDevice.getThreadLimit()))
+ : GenericDevice.getThreadLimit();
+
+ // Pref = Config.Pref > 0 ? max(Config.Pref, Device.Pref) : Device.Pref;
+ PreferredNumThreads =
+ KernelEnvironment.Configuration.MinThreads > 0
+ ? std::max(KernelEnvironment.Configuration.MinThreads,
+ int32_t(GenericDevice.getDefaultNumThreads()))
+ : GenericDevice.getDefaultNumThreads();
return initImpl(GenericDevice, Image);
}
@@ -890,13 +914,8 @@ Error GenericDeviceTy::registerKernelOffloadEntry(
__tgt_offload_entry &DeviceEntry) {
DeviceEntry = KernelEntry;
- // Retrieve the execution mode.
- auto ExecModeOrErr = getExecutionModeForKernel(KernelEntry.name, Image);
- if (!ExecModeOrErr)
- return ExecModeOrErr.takeError();
-
// Create a kernel object.
- auto KernelOrErr = constructKernel(KernelEntry, *ExecModeOrErr);
+ auto KernelOrErr = constructKernel(KernelEntry);
if (!KernelOrErr)
return KernelOrErr.takeError();
@@ -914,45 +933,6 @@ Error GenericDeviceTy::registerKernelOffloadEntry(
return Plugin::success();
}
-Expected<KernelEnvironmentTy>
-GenericDeviceTy::getKernelEnvironmentForKernel(StringRef Name,
- DeviceImageTy &Image) {
- // Create a metadata object for the kernel environment object.
- StaticGlobalTy<KernelEnvironmentTy> KernelEnv(Name.data(),
- "_kernel_environment");
-
- // Retrieve kernel environment object for the kernel.
- GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler();
- if (auto Err = GHandler.readGlobalFromImage(*this, Image, KernelEnv))
- return std::move(Err);
-
- return KernelEnv.getValue();
-}
-
-Expected<OMPTgtExecModeFlags>
-GenericDeviceTy::getExecutionModeForKernel(StringRef Name,
- DeviceImageTy &Image) {
- auto KernelEnvOrError = getKernelEnvironmentForKernel(Name, Image);
- if (!KernelEnvOrError) {
- [[maybe_unused]] std::string ErrStr =
- toString(KernelEnvOrError.takeError());
- DP("Failed to read kernel environment for '%s': %s\n"
- "Using default SPMD (2) execution mode\n",
- Name.data(), ErrStr.data());
- return OMP_TGT_EXEC_MODE_SPMD;
- }
-
- auto &KernelEnv = *KernelEnvOrError;
- auto ExecMode = KernelEnv.Configuration.ExecMode;
-
- // Check that the retrieved execution mode is valid.
- if (!GenericKernelTy::isValidExecutionMode(ExecMode))
- return Plugin::error("Invalid execution mode %d for '%s'", ExecMode,
- Name.data());
-
- return ExecMode;
-}
-
Error PinnedAllocationMapTy::insertEntry(void *HstPtr, void *DevAccessiblePtr,
size_t Size, bool ExternallyLocked) {
// Insert the new entry into the map.
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index ddcf3b3cc9b9537..e61b28b46267757 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -255,9 +255,8 @@ class DeviceImageTy {
/// implement the necessary virtual function members.
struct GenericKernelTy {
/// Construct a kernel with a name and a execution mode.
- GenericKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
- : Name(Name), ExecutionMode(ExecutionMode), PreferredNumThreads(0),
- MaxNumThreads(0) {}
+ GenericKernelTy(const char *Name)
+ : Name(Name), PreferredNumThreads(0), MaxNumThreads(0) {}
virtual ~GenericKernelTy() {}
@@ -285,6 +284,11 @@ struct GenericKernelTy {
return *ImagePtr;
}
+ /// Return the kernel environment object for kernel \p Name.
+ const KernelEnvironmentTy &getKernelEnvironmentForKernel() {
+ return KernelEnvironment;
+ }
+
/// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
switch (ExecutionMode) {
@@ -299,7 +303,7 @@ struct GenericKernelTy {
protected:
/// Get the execution mode name of the kernel.
const char *getExecutionModeName() const {
- switch (ExecutionMode) {
+ switch (KernelEnvironment.Configuration.ExecMode) {
case OMP_TGT_EXEC_MODE_SPMD:
return "SPMD";
case OMP_TGT_EXEC_MODE_GENERIC:
@@ -343,19 +347,20 @@ struct GenericKernelTy {
/// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
bool isGenericSPMDMode() const {
- return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD;
+ return KernelEnvironment.Configuration.ExecMode ==
+ OMP_TGT_EXEC_MODE_GENERIC_SPMD;
}
bool isGenericMode() const {
- return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC;
+ return KernelEnvironment.Configuration.ExecMode ==
+ OMP_TGT_EXEC_MODE_GENERIC;
+ }
+ bool isSPMDMode() const {
+ return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_SPMD;
}
- bool isSPMDMode() const { return ExecutionMode == OMP_TGT_EXEC_MODE_SPMD; }
/// The kernel name.
const char *Name;
- /// The execution flags of the kernel.
- OMPTgtExecModeFlags ExecutionMode;
-
/// The image that contains this kernel.
DeviceImageTy *ImagePtr = nullptr;
@@ -365,6 +370,9 @@ struct GenericKernelTy {
/// The maximum number of threads which the kernel could leverage.
uint32_t MaxNumThreads;
+
+ /// The kernel environment, including execution flags.
+ KernelEnvironmentTy KernelEnvironment;
};
/// Class representing a map of host pinned allocations. We track these pinned
@@ -819,8 +827,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Allocate and construct a kernel object.
virtual Expected<GenericKernelTy &>
- constructKernel(const __tgt_offload_entry &KernelEntry,
- OMPTgtExecModeFlags ExecMode) = 0;
+ constructKernel(const __tgt_offload_entry &KernelEntry) = 0;
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
@@ -864,10 +871,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
protected:
- /// Return the execution mode used for kernel \p Name.
- virtual Expected<OMPTgtExecModeFlags>
- getExecutionModeForKernel(StringRef Name, DeviceImageTy &Image);
-
/// Environment variables defined by the LLVM OpenMP implementation
/// regarding the initial number of streams and events.
UInt32Envar OMPX_InitialNumStreams;
@@ -916,10 +919,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
#endif
private:
- /// Return the kernel environment object for kernel \p Name.
- Expected<KernelEnvironmentTy>
- getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image);
-
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
};
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index 431e34ca75cd652..d3375b5a556bd8e 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -79,8 +79,7 @@ struct CUDADeviceImageTy : public DeviceImageTy {
/// generic kernel class.
struct CUDAKernelTy : public GenericKernelTy {
/// Create a CUDA kernel with a name and an execution mode.
- CUDAKernelTy(const char *Name, OMPTgtExecModeFlags ExecMode)
- : GenericKernelTy(Name, ExecMode), Func(nullptr) {}
+ CUDAKernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}
/// Initialize the CUDA kernel.
Error initImpl(GenericDeviceTy &GenericDevice,
@@ -356,14 +355,13 @@ struct CUDADeviceTy : public GenericDeviceTy {
/// Allocate and construct a CUDA kernel.
Expected<GenericKernelTy &>
- constructKernel(const __tgt_offload_entry &KernelEntry,
- OMPTgtExecModeFlags ExecMode) override {
+ constructKernel(const __tgt_offload_entry &KernelEntry) override {
// Allocate and construct the CUDA kernel.
CUDAKernelTy *CUDAKernel = Plugin::get().allocate<CUDAKernelTy>();
if (!CUDAKernel)
return Plugin::error("Failed to allocate memory for CUDA kernel");
- new (CUDAKernel) CUDAKernelTy(KernelEntry.name, ExecMode);
+ new (CUDAKernel) CUDAKernelTy(KernelEntry.name);
return *CUDAKernel;
}
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index 619f4dfed9b4e63..85cf9bef1543b2a 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/DynamicLibrary.h"
@@ -51,8 +52,7 @@ using llvm::sys::DynamicLibrary;
/// Class implementing kernel functionalities for GenELF64.
struct GenELF64KernelTy : public GenericKernelTy {
/// Construct the kernel with a name and an execution mode.
- GenELF64KernelTy(const char *Name, OMPTgtExecModeFlags ExecMode)
- : GenericKernelTy(Name, ExecMode), Func(nullptr) {}
+ GenELF64KernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}
/// Initialize the kernel.
Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -71,6 +71,10 @@ struct GenELF64KernelTy : public GenericKernelTy {
// Save the function pointer.
Func = (void (*)())Global.getPtr();
+ KernelEnvironment.Configuration.ExecMode = OMP_TGT_EXEC_MODE_GENERIC;
+ KernelEnvironment.Configuration.MayUseNestedParallelism = /* Unknown */ 2;
+ KernelEnvironment.Configuration.UseGenericStateMachine = /* Unknown */ 2;
+
// Set the maximum number of threads to a single.
MaxNumThreads = 1;
return Plugin::success();
@@ -137,15 +141,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
/// Construct the kernel for a specific image on the device.
Expected<GenericKernelTy &>
- constructKernel(const __tgt_offload_entry &KernelEntry,
- OMPTgtExecModeFlags ExecMode) override {
+ constructKernel(const __tgt_offload_entry &KernelEntry) override {
// Allocate and construct the kernel.
GenELF64KernelTy *GenELF64Kernel =
Plugin::get().allocate<GenELF64KernelTy>();
if (!GenELF64Kernel)
return Plugin::error("Failed to allocate memory for GenELF64 kernel");
- new (GenELF64Kernel) GenELF64KernelTy(KernelEntry.name, ExecMode);
+ new (GenELF64Kernel) GenELF64KernelTy(KernelEntry.name);
return *GenELF64Kernel;
}
@@ -325,13 +328,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
}
Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
-protected:
- /// Retrieve the execution mode for kernels. All kernels use the generic mode.
- Expected<OMPTgtExecModeFlags>
- getExecutionModeForKernel(StringRef Name, DeviceImageTy &Image) override {
- return OMP_TGT_EXEC_MODE_GENERIC;
- }
-
private:
/// Grid values for Generic ELF64 plugins.
static constexpr GV GenELF64GridValues = {
diff --git a/openmp/libomptarget/test/offloading/default_thread_limit.c b/openmp/libomptarget/test/offloading/default_thread_limit.c
index 73c7e08ccaed498..d32e7df418cbbd0 100644
--- a/openmp/libomptarget/test/offloading/default_thread_limit.c
+++ b/openmp/libomptarget/test/offloading/default_thread_limit.c
@@ -48,8 +48,7 @@ int main() {
for (int i = 0; i < N; ++i) {
optnone();
}
-// FIXME: Use the attribute value to imply a thread_limit
-// DEFAULT: {{(128|256)}} (MaxFlatWorkGroupSize: 42
+// DEFAULT: 42 (MaxFlatWorkGroupSize: 42
#pragma omp target ompx_attribute(__attribute__((amdgpu_flat_work_group_size(42, 42))))
#pragma omp teams distribute parallel for
for (int i = 0; i < N; ++i) {
diff --git a/openmp/libomptarget/test/offloading/thread_state_1.c b/openmp/libomptarget/test/offloading/thread_state_1.c
index f3f7b32eead5645..908b71638097fa8 100644
--- a/openmp/libomptarget/test/offloading/thread_state_1.c
+++ b/openmp/libomptarget/test/offloading/thread_state_1.c
@@ -26,8 +26,8 @@ int main() {
}
}
}
- if (o_lvl == 1 && o_tid == o_nt - 1 && o_nt > 1 && i_lvl == 2 && i_tid == 0 &&
- i_nt == 1) {
+ if (o_lvl == 1 && o_tid == o_nt - 1 && o_nt >= 1 && i_lvl == 2 &&
+ i_tid == 0 && i_nt == 1) {
// CHECK: Success
printf("Success\n");
return 0;
diff --git a/openmp/libomptarget/test/offloading/thread_state_2.c b/openmp/libomptarget/test/offloading/thread_state_2.c
index 6d3bf1661f46228..38bc86b7ad0c4aa 100644
--- a/openmp/libomptarget/test/offloading/thread_state_2.c
+++ b/openmp/libomptarget/test/offloading/thread_state_2.c
@@ -28,8 +28,8 @@ int main() {
}
}
}
- if (o_lvl == 1 && o_tid == o_nt - 1 && o_nt > 1 && i_lvl == 2 && i_tid == 0 &&
- i_nt == 1) {
+ if (o_lvl == 1 && o_tid == o_nt - 1 && o_nt >= 1 && i_lvl == 2 &&
+ i_tid == 0 && i_nt == 1) {
// CHECK: Success
printf("Success\n");
return 0;
More information about the Openmp-commits
mailing list