[Openmp-commits] [openmp] ae5348a - [openmp][amdgpu] Make plugin robust to presence of explicit implicit arguments
Jon Chesterfield via Openmp-commits
openmp-commits at lists.llvm.org
Mon Nov 22 15:00:50 PST 2021
Author: Jon Chesterfield
Date: 2021-11-22T23:00:20Z
New Revision: ae5348a38eb1668cd9042d9a5207dc32bc4edb87
URL: https://github.com/llvm/llvm-project/commit/ae5348a38eb1668cd9042d9a5207dc32bc4edb87
DIFF: https://github.com/llvm/llvm-project/commit/ae5348a38eb1668cd9042d9a5207dc32bc4edb87.diff
LOG: [openmp][amdgpu] Make plugin robust to presence of explicit implicit arguments
OpenMP (compiler) does not currently request any implicit kernel
arguments. OpenMP (runtime) allocates and initialises a reasonable guess at
the implicit kernel arguments anyway.
This change makes the plugin check the number of explicit arguments, instead
of all arguments, and puts the pointer to the hostcall buffer both in the
current location and at the offset expected when implicit arguments are added
to the metadata by D113538.
This is intended to keep things running while fixing the oversight in the
compiler (in D113538). Once that patch lands, and a following one marks
openmp kernels that use printf such that the backend emits an args element
with the right type (instead of hidden_node), the over-allocation can be
removed and the hardcoded 8*(e+3) offset (e = explicit argument count) replaced with one read from the
.offset of the corresponding metadata element.
Reviewed By: estewart08
Differential Revision: https://reviews.llvm.org/D114274
Added:
Modified:
openmp/libomptarget/plugins/amdgpu/impl/internal.h
openmp/libomptarget/plugins/amdgpu/impl/system.cpp
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
index bdac98c1e101f..fe974d3676f58 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
@@ -54,7 +54,8 @@ typedef struct atl_kernel_info_s {
uint32_t sgpr_spill_count;
uint32_t vgpr_spill_count;
uint32_t kernel_segment_size;
- uint32_t num_args;
+ uint32_t explicit_argument_count;
+ uint32_t implicit_argument_count;
} atl_kernel_info_t;
typedef struct atl_symbol_info_s {
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
index 4d661dc381006..6dd464ec1f7b0 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
@@ -381,7 +381,7 @@ static hsa_status_t get_code_object_custom_metadata(
return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
}
- atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0};
+ atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count;
msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count);
@@ -446,8 +446,6 @@ static hsa_status_t get_code_object_custom_metadata(
return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
}
- info.num_args = argsSize;
-
for (size_t i = 0; i < argsSize; ++i) {
KernelArgMD lcArg;
@@ -476,8 +474,10 @@ static hsa_status_t get_code_object_custom_metadata(
// check if the arg is a hidden/implicit arg
// this logic assumes that all hidden args are 8-byte aligned
if (!isImplicit(lcArg.valueKind_)) {
+ info.explicit_argument_count++;
kernel_explicit_args_size += lcArg.size_;
} else {
+ info.implicit_argument_count++;
hasHiddenArgs = true;
}
kernel_explicit_args_size += padding;
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 71321be3e83c0..45d94765936ab 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -2071,7 +2071,7 @@ int32_t __tgt_rtl_run_target_team_region_locked(
const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count;
const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count;
- assert(arg_num == (int)KernelInfoEntry.num_args);
+ assert(arg_num == (int)KernelInfoEntry.explicit_argument_count);
/*
* Set limit based on ThreadsPerGroup and GroupsPerDevice
@@ -2173,14 +2173,31 @@ int32_t __tgt_rtl_run_target_team_region_locked(
// under a multiple reader lock, not a writer lock.
static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_lock(&hostcall_init_lock);
- impl_args->hostcall_ptr = hostrpc_assign_buffer(
+ unsigned long buffer = hostrpc_assign_buffer(
DeviceInfo.HSAAgents[device_id], queue, device_id);
pthread_mutex_unlock(&hostcall_init_lock);
- if (!impl_args->hostcall_ptr) {
+ if (!buffer) {
DP("hostrpc_assign_buffer failed, gpu would dereference null and "
"error\n");
return OFFLOAD_FAIL;
}
+
+ if (KernelInfoEntry.implicit_argument_count >= 4) {
+ // Initialise pointer for implicit_argument_count != 0 ABI
+ // Guess that the right implicit argument is at offset 24 after
+ // the explicit arguments. In the future, should be able to read
+ // the offset from msgpack. Clang is not annotating it at present.
+ uint64_t Offset =
+ sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
+ if ((Offset + 8) > (ArgPool->kernarg_segment_size)) {
+ DP("Bad offset of hostcall, exceeds kernarg segment size\n");
+ } else {
+ memcpy(static_cast<char *>(kernarg) + Offset, &buffer, 8);
+ }
+ }
+
+ // initialise pointer for implicit_argument_count == 0 ABI
+ impl_args->hostcall_ptr = buffer;
}
packet->kernarg_address = kernarg;
More information about the Openmp-commits
mailing list