[Openmp-commits] [openmp] fb2c42d - [OpenMP] Improve AMDGPU Plugin
Johannes Doerfert via Openmp-commits
openmp-commits at lists.llvm.org
Mon Dec 19 19:11:06 PST 2022
Author: Johannes Doerfert
Date: 2022-12-19T19:09:43-08:00
New Revision: fb2c42df41cb01e1122fd4e9c81e1f4bc5592b12
URL: https://github.com/llvm/llvm-project/commit/fb2c42df41cb01e1122fd4e9c81e1f4bc5592b12
DIFF: https://github.com/llvm/llvm-project/commit/fb2c42df41cb01e1122fd4e9c81e1f4bc5592b12.diff
LOG: [OpenMP] Improve AMDGPU Plugin
With this patch we:
- pick more sensible defaults for the number of teams, inspired by the
old plugin, and configured via LIBOMPTARGET_AMDGPU_TEAMS_PER_CU.
- check the input signal of a kernel launch late, after the queue lock
was taken, to avoid a barrier packet more often.
- copy the kernel arguments in one swoop into the appropriate memory.
- manually specialize the callbacks to avoid potential indirect calls.
Added:
Modified:
llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
Removed:
################################################################################
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
index d61b0a32ae965..93464063dfaf5 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -66,6 +66,9 @@ struct GV {
/// the maximum number of teams.
unsigned GV_Max_Teams;
+ // The default number of teams in the absence of any other information.
+ unsigned GV_Default_Num_Teams;
+
// An alternative to the heavy data sharing infrastructure that uses global
// memory is one that uses device __shared__ memory. The amount of such space
// (in bytes) reserved by the OpenMP runtime is noted here.
@@ -82,21 +85,23 @@ struct GV {
/// For AMDGPU GPUs
static constexpr GV AMDGPUGridValues64 = {
- 256, // GV_Slot_Size
- 64, // GV_Warp_Size
+ 256, // GV_Slot_Size
+ 64, // GV_Warp_Size
(1 << 16), // GV_Max_Teams
- 896, // GV_SimpleBufferSize
- 1024, // GV_Max_WG_Size,
- 256, // GV_Default_WG_Size
+ 440, // GV_Default_Num_Teams
+ 896, // GV_SimpleBufferSize
+ 1024, // GV_Max_WG_Size,
+ 256, // GV_Default_WG_Size
};
static constexpr GV AMDGPUGridValues32 = {
- 256, // GV_Slot_Size
- 32, // GV_Warp_Size
+ 256, // GV_Slot_Size
+ 32, // GV_Warp_Size
(1 << 16), // GV_Max_Teams
- 896, // GV_SimpleBufferSize
- 1024, // GV_Max_WG_Size,
- 256, // GV_Default_WG_Size
+ 440, // GV_Default_Num_Teams
+ 896, // GV_SimpleBufferSize
+ 1024, // GV_Max_WG_Size,
+ 256, // GV_Default_WG_Size
};
template <unsigned wavesize> constexpr const GV &getAMDGPUGridValues() {
@@ -106,12 +111,13 @@ template <unsigned wavesize> constexpr const GV &getAMDGPUGridValues() {
/// For Nvidia GPUs
static constexpr GV NVPTXGridValues = {
- 256, // GV_Slot_Size
- 32, // GV_Warp_Size
+ 256, // GV_Slot_Size
+ 32, // GV_Warp_Size
(1 << 16), // GV_Max_Teams
- 896, // GV_SimpleBufferSize
- 1024, // GV_Max_WG_Size
- 128, // GV_Default_WG_Size
+ 3200, // GV_Default_Num_Teams
+ 896, // GV_SimpleBufferSize
+ 1024, // GV_Max_WG_Size
+ 128, // GV_Default_WG_Size
};
} // namespace omp
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index c90aa04ad435d..b9c38778fbc17 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -121,6 +121,8 @@ struct AMDGPUResourceRef : public GenericDeviceResourceRef {
/// Create a reference to an existing resource.
AMDGPUResourceRef(ResourceTy *Resource) : Resource(Resource) {}
+ virtual ~AMDGPUResourceRef() {}
+
/// Create a new resource and save the reference. The reference must be empty
/// before calling to this function.
Error create(GenericDeviceTy &Device) override;
@@ -540,6 +542,10 @@ struct AMDGPUQueueTy {
// should be lightweight; do not block the thread, allocate memory, etc.
std::lock_guard<std::mutex> Lock(Mutex);
+ // Avoid defining the input dependency if already satisfied.
+ if (InputSignal && !InputSignal->load())
+ InputSignal = nullptr;
+
// Add a barrier packet before the kernel packet in case there is a pending
// preceding operation. The barrier packet will delay the processing of
// subsequent queue's packets until the barrier input signal are satisfied.
@@ -786,8 +792,18 @@ struct AMDGPUStreamTy {
return Plugin::success();
// Perform the action.
- if (auto Err = (*ActionFunction)(&ActionArgs))
- return Err;
+ if (ActionFunction == memcpyAction) {
+ if (auto Err = memcpyAction(&ActionArgs))
+ return Err;
+ } else if (ActionFunction == releaseBufferAction) {
+ if (auto Err = releaseBufferAction(&ActionArgs))
+ return Err;
+ } else if (ActionFunction == releaseSignalAction) {
+ if (auto Err = releaseSignalAction(&ActionArgs))
+ return Err;
+ } else {
+ return Plugin::error("Unknown action function!");
+ }
// Invalidate the action.
ActionFunction = nullptr;
@@ -990,10 +1006,6 @@ struct AMDGPUStreamTy {
// Consume stream slot and compute dependencies.
auto [Curr, InputSignal] = consume(OutputSignal);
- // Avoid defining the input dependency if already satisfied.
- if (InputSignal && !InputSignal->load())
- InputSignal = nullptr;
-
// Setup the post action to release the kernel args buffer.
if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
return Err;
@@ -1485,8 +1497,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
AMDGPUDeviceTy(int32_t DeviceId, int32_t NumDevices,
AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
: GenericDeviceTy(DeviceId, NumDevices, {0}), AMDGenericDeviceTy(),
- OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 8),
- OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 1024),
+ OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4),
+ OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512),
+ OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4),
OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
1 * 1024 * 1024), // 1MB
OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
@@ -1528,10 +1541,18 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
hsa_dim3_t GridMaxDim;
if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim))
return Err;
+
GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size;
if (GridValues.GV_Max_Teams == 0)
return Plugin::error("Maximum number of teams cannot be zero");
+ // Compute the default number of teams.
+ uint32_t ComputeUnits = 0;
+ if (auto Err =
+ getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits))
+ return Err;
+ GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU;
+
// Get maximum size of any device queues and maximum number of queues.
uint32_t MaxQueueSize;
if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize))
@@ -2014,6 +2035,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// process them.
UInt32Envar OMPX_QueueSize;
+ /// Envar for controlling the default number of teams relative to the number
+ /// of compute units (CUs) the device has:
+ /// #default_teams = OMPX_DefaultTeamsPerCU * #CUs.
+ UInt32Envar OMPX_DefaultTeamsPerCU;
+
/// Envar specifying the maximum size in bytes where the memory copies are
/// asynchronous operations. Up to this transfer size, the memory copies are
/// asychronous operations pushed to the corresponding stream. For larger
@@ -2226,9 +2252,9 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
// Classify the agents into kernel (GPU) and host (CPU) kernels.
if (DeviceType == HSA_DEVICE_TYPE_GPU) {
// Ensure that the GPU agent supports kernel dispatch packets.
- hsa_agent_feature_t features;
- Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &features);
- if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
+ hsa_agent_feature_t Features;
+ Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features);
+ if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
KernelAgents.push_back(Agent);
} else if (DeviceType == HSA_DEVICE_TYPE_CPU) {
HostAgents.push_back(Agent);
@@ -2405,11 +2431,11 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
std::memset(ImplArgs, 0, ImplicitArgsSize);
// Copy the explicit arguments.
- for (int32_t ArgId = 0; ArgId < NumKernelArgs; ++ArgId) {
- void *Dst = (char *)AllArgs + sizeof(void *) * ArgId;
- void *Src = *((void **)KernelArgs + ArgId);
- std::memcpy(Dst, Src, sizeof(void *));
- }
+ // TODO: We should expose the args memory manager alloc to the common part as
+ // alternative to copying them twice.
+ if (NumKernelArgs)
+ std::memcpy(AllArgs, *static_cast<void **>(KernelArgs),
+ sizeof(void *) * NumKernelArgs);
AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
AMDGPUStreamTy &Stream = AMDGPUDevice.getStream(AsyncInfoWrapper);
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 83b656acc8b9d..6ce6c29481c55 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -101,16 +101,21 @@ uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
uint64_t NumTeamsClause,
uint64_t LoopTripCount,
uint32_t NumThreads) const {
- uint64_t PreferredNumBlocks = getDefaultNumBlocks(GenericDevice);
if (NumTeamsClause > 0) {
- PreferredNumBlocks = NumTeamsClause;
- } else if (LoopTripCount > 0) {
+ // TODO: We need to honor any value and consequently allow more than the
+ // block limit. For this we might need to start multiple kernels or let the
+ // blocks start again until the requested number has been started.
+ return std::min(NumTeamsClause, GenericDevice.getBlockLimit());
+ }
+
+ uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
+ if (LoopTripCount > 0) {
if (isSPMDMode()) {
// We have a combined construct, i.e. `target teams distribute
// parallel for [simd]`. We launch so many teams so that each thread
// will execute one iteration of the loop. round up to the nearest
// integer
- PreferredNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+ TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
} else {
assert((isGenericMode() || isGenericSPMDMode()) &&
"Unexpected execution mode!");
@@ -126,9 +131,12 @@ uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
//
// Threads within a team will execute the iterations of the `parallel`
// loop.
- PreferredNumBlocks = LoopTripCount;
+ TripCountNumBlocks = LoopTripCount;
}
}
+ // If the loops are long running we rather reuse blocks than spawn too many.
+ uint64_t PreferredNumBlocks =
+ std::min(TripCountNumBlocks, getDefaultNumBlocks(GenericDevice));
return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
}
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index eeb6f69db1ace..9b89e316551db 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -372,8 +372,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
return GridValues.GV_Default_WG_Size;
}
uint64_t getDefaultNumBlocks() const {
- // TODO: Introduce a default num blocks value.
- return GridValues.GV_Default_WG_Size;
+ return GridValues.GV_Default_Num_Teams;
}
uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
More information about the Openmp-commits
mailing list