[Openmp-commits] [openmp] b75a748 - [libomptarget] Apply D110029 to amdgpu
Jon Chesterfield via Openmp-commits
openmp-commits at lists.llvm.org
Thu Sep 30 13:29:45 PDT 2021
Author: Jon Chesterfield
Date: 2021-09-30T21:29:37+01:00
New Revision: b75a7481baad7657b27e54b79fec4f5e6cd4cc7a
URL: https://github.com/llvm/llvm-project/commit/b75a7481baad7657b27e54b79fec4f5e6cd4cc7a
DIFF: https://github.com/llvm/llvm-project/commit/b75a7481baad7657b27e54b79fec4f5e6cd4cc7a.diff
LOG: [libomptarget] Apply D110029 to amdgpu
Use enum for execution mode.
This is partly a port from ROCm and partly a port from D110029. I attempted to
make the same choices as ROCm, as far as comments etc. go, to reduce merge
conflicts.
There is some cleanup warranted here - in particular I like how the CUDA patch
factors out the comparisons into named variables - but I'd like to leave
that for a follow-up patch, keeping this one minimal.
Reviewed By: carlo.bertolli
Differential Revision: https://reviews.llvm.org/D110845
Added:
Modified:
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 1d2ed42eb746e..1d7980d694fa5 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -34,9 +34,9 @@
#include "omptargetplugin.h"
#include "print_tracing.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
-
// hostrpc interface, FIXME: consider moving to its own include these are
// statically linked into amdgpu/plugin if present from hostrpc_services.a,
// linked as --whole-archive to override the weak symbols that are used to
@@ -107,14 +107,6 @@ struct FuncOrGblEntryTy {
std::vector<__tgt_offload_entry> Entries;
};
-enum ExecutionModeType {
- SPMD, // constructors, destructors,
- // combined constructs (`teams distribute parallel for [simd]`)
- GENERIC, // everything else
- SPMD_GENERIC, // Generic kernel with SPMD execution
- NONE
-};
-
struct KernelArgPool {
private:
static pthread_mutex_t mutex;
@@ -219,18 +211,14 @@ std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
- // execution mode of kernel
- // 0 - SPMD mode (without master warp)
- // 1 - Generic mode (with master warp)
- // 2 - SPMD mode execution with Generic mode semantics.
- int8_t ExecutionMode;
+ llvm::omp::OMPTgtExecModeFlags ExecutionMode;
int16_t ConstWGSize;
int32_t device_id;
void *CallStackAddr = nullptr;
const char *Name;
- KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id,
- void *_CallStackAddr, const char *_Name,
+ KernelTy(llvm::omp::OMPTgtExecModeFlags _ExecutionMode, int16_t _ConstWGSize,
+ int32_t _device_id, void *_CallStackAddr, const char *_Name,
uint32_t _kernarg_segment_size,
hsa_amd_memory_pool_t &KernArgMemoryPool)
: ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize),
@@ -1694,7 +1682,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
}
// default value GENERIC (in case symbol is missing from cubin file)
- int8_t ExecModeVal = ExecutionModeType::GENERIC;
+ llvm::omp::OMPTgtExecModeFlags ExecModeVal =
+ llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
// get flat group size if present, else Default_WG_Size
int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
@@ -1705,7 +1694,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
uint16_t Version;
uint16_t TSize;
uint16_t WG_Size;
- uint8_t Mode;
};
struct KernDescValType KernDescVal;
std::string KernDescNameStr(e->name);
@@ -1735,11 +1723,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
DP("KernDesc: Version: %d\n", KernDescVal.Version);
DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size);
- DP("KernDesc: Mode: %d\n", KernDescVal.Mode);
- // Get ExecMode
- ExecModeVal = KernDescVal.Mode;
- DP("ExecModeVal %d\n", ExecModeVal);
if (KernDescVal.WG_Size == 0) {
KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size;
DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size);
@@ -1750,43 +1734,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
} else {
DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);
- // Generic
- std::string ExecModeNameStr(e->name);
- ExecModeNameStr += "_exec_mode";
- const char *ExecModeName = ExecModeNameStr.c_str();
-
- void *ExecModePtr;
- uint32_t varsize;
- err = interop_get_symbol_info((char *)image->ImageStart, img_size,
- ExecModeName, &ExecModePtr, &varsize);
-
- if (err == HSA_STATUS_SUCCESS) {
- if ((size_t)varsize != sizeof(int8_t)) {
- DP("Loading global computation properties '%s' - size mismatch(%u != "
- "%lu)\n",
- ExecModeName, varsize, sizeof(int8_t));
- return NULL;
- }
-
- memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize);
-
- DP("After loading global for %s ExecMode = %d\n", ExecModeName,
- ExecModeVal);
-
- if (ExecModeVal < 0 || ExecModeVal > 2) {
- DP("Error wrong exec_mode value specified in HSA code object file: "
- "%d\n",
- ExecModeVal);
- return NULL;
- }
- } else {
- DP("Loading global exec_mode '%s' - symbol missing, using default "
- "value "
- "GENERIC (1)\n",
- ExecModeName);
- }
- check("Loading computation property", err);
-
// Flat group size
std::string WGSizeNameStr(e->name);
WGSizeNameStr += "_wg_size";
@@ -1826,6 +1773,44 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
check("Loading WGSize computation property", err);
}
+ // Read execution mode from global in binary
+ std::string ExecModeNameStr(e->name);
+ ExecModeNameStr += "_exec_mode";
+ const char *ExecModeName = ExecModeNameStr.c_str();
+
+ void *ExecModePtr;
+ uint32_t varsize;
+ err = interop_get_symbol_info((char *)image->ImageStart, img_size,
+ ExecModeName, &ExecModePtr, &varsize);
+
+ if (err == HSA_STATUS_SUCCESS) {
+ if ((size_t)varsize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
+ DP("Loading global computation properties '%s' - size mismatch(%u != "
+ "%lu)\n",
+ ExecModeName, varsize, sizeof(llvm::omp::OMPTgtExecModeFlags));
+ return NULL;
+ }
+
+ memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize);
+
+ DP("After loading global for %s ExecMode = %d\n", ExecModeName,
+ ExecModeVal);
+
+ if (ExecModeVal < 0 ||
+ ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) {
+ DP("Error wrong exec_mode value specified in HSA code object file: "
+ "%d\n",
+ ExecModeVal);
+ return NULL;
+ }
+ } else {
+ DP("Loading global exec_mode '%s' - symbol missing, using default "
+ "value "
+ "GENERIC (1)\n",
+ ExecModeName);
+ }
+ check("Loading computation property", err);
+
KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id,
CallStackAddr, e->name, kernarg_segment_size,
DeviceInfo.KernArgPool));
@@ -1916,9 +1901,10 @@ struct launchVals {
int GridSize;
};
launchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
- int ConstWGSize, int ExecutionMode, int num_teams,
- int thread_limit, uint64_t loop_tripcount,
- int DeviceNumTeams) {
+ int ConstWGSize,
+ llvm::omp::OMPTgtExecModeFlags ExecutionMode,
+ int num_teams, int thread_limit,
+ uint64_t loop_tripcount, int DeviceNumTeams) {
int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size;
int num_groups = 0;
@@ -1943,7 +1929,9 @@ launchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
if (thread_limit > 0) {
threadsPerGroup = thread_limit;
DP("Setting threads per block to requested %d\n", thread_limit);
- if (ExecutionMode == GENERIC) { // Add master warp for GENERIC
+ // Add master warp for GENERIC
+ if (ExecutionMode ==
+ llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
threadsPerGroup += WarpSize;
DP("Adding master wavefront: +%d threads\n", WarpSize);
}
@@ -2004,12 +1992,14 @@ launchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
} else {
if (num_teams <= 0) {
if (loop_tripcount > 0) {
- if (ExecutionMode == SPMD) {
+ if (ExecutionMode ==
+ llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
// round up to the nearest integer
num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1;
- } else if (ExecutionMode == GENERIC) {
+ } else if (ExecutionMode ==
+ llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
num_groups = loop_tripcount;
- } else /* ExecutionMode == SPMD_GENERIC */ {
+ } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
// This is a generic kernel that was transformed to use SPMD-mode
// execution but uses Generic-mode semantics for scheduling.
num_groups = loop_tripcount;
More information about the Openmp-commits
mailing list