[Openmp-commits] [openmp] [amdgpu][openmp] Avoiding writing to packet header twice (PR #70695)
Jon Chesterfield via Openmp-commits
openmp-commits at lists.llvm.org
Mon Oct 30 11:09:11 PDT 2023
https://github.com/JonChesterfield updated https://github.com/llvm/llvm-project/pull/70695
>From 53fafacba00422efa41dbc52576e215ff594ca24 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield <jonathanchesterfield at gmail.com>
Date: Mon, 30 Oct 2023 17:27:57 +0000
Subject: [PATCH] [amdgpu][openmp] Avoiding writing to packet header twice
---
libc/utils/gpu/loader/amdgpu/Loader.cpp | 16 +++++++++-------
.../plugins-nextgen/amdgpu/src/rtl.cpp | 15 ++++++++-------
2 files changed, 17 insertions(+), 14 deletions(-)
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index 1d0247a6dc5dca0..86a39a5a3be4d23 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -222,13 +222,13 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
// Set up the packet for exeuction on the device. We currently only launch
// with one thread on the device, forcing the rest of the wavefront to be
// masked off.
- std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
- packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
- (params.num_blocks_z * params.num_threads_z != 1))
- << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+ uint16_t setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
+ (params.num_blocks_z * params.num_threads_z != 1))
+ << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet->workgroup_size_x = params.num_threads_x;
packet->workgroup_size_y = params.num_threads_y;
packet->workgroup_size_z = params.num_threads_z;
+ packet->reserved0 = 0;
packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
@@ -236,7 +236,7 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
packet->group_segment_size = group_size;
packet->kernel_object = kernel;
packet->kernarg_address = args;
-
+ packet->reserved2 = 0;
// Create a signal to indicate when this packet has been completed.
if (hsa_status_t err =
hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
@@ -244,12 +244,14 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
// Initialize the packet header and set the doorbell signal to begin execution
// by the HSA runtime.
- uint16_t setup = packet->setup;
+
uint16_t header =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
- __atomic_store_n(&packet->header, header | (setup << 16), __ATOMIC_RELEASE);
+ uint32_t header_word =
+ header | (setup << 16u) __atomic_store_n((uint32_t *)&packet->header,
+ header_word, __ATOMIC_RELEASE);
hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
// Wait until the kernel has completed execution on the device. Periodically
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index ab24856f9bc78e4..b510922396ca4b3 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -650,8 +650,8 @@ struct AMDGPUQueueTy {
hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
assert(Packet && "Invalid packet");
- // The header of the packet is written in the last moment.
- Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+ // The first 32 bits of the packet are written after the other fields
+ uint16_t Setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
Packet->workgroup_size_x = NumThreads;
Packet->workgroup_size_y = 1;
Packet->workgroup_size_z = 1;
@@ -667,7 +667,7 @@ struct AMDGPUQueueTy {
Packet->completion_signal = OutputSignal->get();
// Publish the packet. Do not modify the packet after this point.
- publishKernelPacket(PacketId, Packet);
+ publishKernelPacket(PacketId, Setup, Packet);
return Plugin::success();
}
@@ -744,17 +744,17 @@ struct AMDGPUQueueTy {
/// Publish the kernel packet so that the HSA runtime can start processing
/// the kernel launch. Do not modify the packet once this function is called.
/// Assumes the queue lock is acquired.
- void publishKernelPacket(uint64_t PacketId,
+ void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
hsa_kernel_dispatch_packet_t *Packet) {
uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
- uint16_t Setup = Packet->setup;
uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
// Publish the packet. Do not modify the package after this point.
- __atomic_store_n(PacketPtr, Header | (Setup << 16), __ATOMIC_RELEASE);
+ uint32_t HeaderWord = Header | (Setup << 16u);
+ __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
// Signal the doorbell about the published packet.
hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
@@ -773,7 +773,8 @@ struct AMDGPUQueueTy {
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
// Publish the packet. Do not modify the package after this point.
- __atomic_store_n(PacketPtr, Header | (Setup << 16), __ATOMIC_RELEASE);
+ uint32_t HeaderWord = Header | (Setup << 16u);
+ __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
// Signal the doorbell about the published packet.
hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
More information about the Openmp-commits
mailing list