[llvm] [OFFLOAD] Add plugin with support for Intel oneAPI Level Zero (PR #158900)
Alexey Sachkov via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 16 06:23:41 PDT 2025
================
@@ -0,0 +1,649 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericKernel implementation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Kernel.h"
+#include "L0Device.h"
+#include "L0Plugin.h"
+#include "L0Program.h"
+
+namespace llvm::omp::target::plugin {
+
+Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
+ uint32_t NumThreads[3], uint32_t NumBlocks[3],
+ KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+
+ auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
+ int32_t RC = runTargetTeamRegion(l0Device, KernelArgs,
+ std::move(LaunchParams), AsyncInfoWrapper);
+ if (RC == OFFLOAD_SUCCESS)
+ return Plugin::success();
+ return Plugin::error(error::ErrorCode::UNKNOWN,
+                       "Error launching kernel %s: %d", getName(), RC);
+}
+
+Error L0KernelTy::buildKernel(L0ProgramTy &Program) {
+ const auto *KernelName = getName();
+
+ auto Module = Program.findModuleFromKernelName(KernelName);
+ ze_kernel_desc_t KernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0,
+ KernelName};
+ CALL_ZE_RET_ERROR(zeKernelCreate, Module, &KernelDesc, &zeKernel);
+ return Plugin::success();
+}
+
+Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice,
+ DeviceImageTy &Image) {
+ auto &Program = L0ProgramTy::makeL0Program(Image);
+
+ Error Err = buildKernel(Program);
+ if (Err)
+ return Err;
+ Program.addKernel(this);
+
+ return Plugin::success();
+}
+
+/// Read global thread limit and max teams from the host runtime. These values
+/// are subject to change at any program point, so every kernel execution
+/// needs to read the most recent values.
+static std::tuple<int32_t, int32_t> readTeamsThreadLimit() {
+  int ThrLimit = omp_get_teams_thread_limit();
+  DP("omp_get_teams_thread_limit() returned %" PRId32 "\n", ThrLimit);
+  // omp_get_teams_thread_limit() may return INT_MAX by default.
+ // NOTE: Windows.h defines max() macro, so we have to guard
+ // the call with parentheses.
+ int32_t ThreadLimit =
+ (ThrLimit > 0 && ThrLimit != (std::numeric_limits<int32_t>::max)())
+ ? ThrLimit
+ : 0;
+
+ int NTeams = omp_get_max_teams();
+ DP("omp_get_max_teams() returned %" PRId32 "\n", NTeams);
+ // omp_get_max_teams() would return INT_MAX by default.
+ // NOTE: Windows.h defines max() macro, so we have to guard
+ // the call with parentheses.
+ int32_t NumTeams =
+ (NTeams > 0 && NTeams != (std::numeric_limits<int32_t>::max)()) ? NTeams
+ : 0;
+
+ return {NumTeams, ThreadLimit};
+}
+
+void L0KernelTy::decideKernelGroupArguments(
+ L0DeviceTy &Device, uint32_t NumTeams, uint32_t ThreadLimit,
+ TgtNDRangeDescTy *LoopLevels, uint32_t *GroupSizes,
+ ze_group_count_t &GroupCounts, bool HalfNumThreads,
+ bool IsTeamsNDRange) const {
+
+ const KernelPropertiesTy &KernelPR = getProperties();
+
+ const auto DeviceId = Device.getDeviceId();
+ bool MaxGroupSizeForced = false;
+ bool MaxGroupCountForced = false;
+ uint32_t MaxGroupSize = Device.getMaxGroupSize();
+ const auto &Option = LevelZeroPluginTy::getOptions();
+ const auto OptSubscRate = Option.SubscriptionRate;
+
+ uint32_t SIMDWidth = KernelPR.SIMDWidth;
+ uint32_t KernelWidth = KernelPR.Width;
+ uint32_t KernelMaxThreadGroupSize = KernelPR.MaxThreadGroupSize;
+
+ if (KernelMaxThreadGroupSize < MaxGroupSize) {
+ MaxGroupSize = KernelMaxThreadGroupSize;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Capping maximum team size to %" PRIu32
+ " due to kernel constraints.\n",
+ MaxGroupSize);
+ }
+
+ if (ThreadLimit > 0) {
+ MaxGroupSizeForced = true;
+ MaxGroupSize = ThreadLimit;
+ }
+
+ uint32_t MaxGroupCount = 0;
+ if (NumTeams > 0) {
+ MaxGroupCount = NumTeams;
+ MaxGroupCountForced = true;
+ }
+
+ if (MaxGroupCountForced) {
+    // If the number of teams is specified by the user, then use KernelWidth
+    // work-items per work-group by default, so that it matches the
+    // decideLoopKernelGroupArguments() behavior.
+ if (!MaxGroupSizeForced) {
+ MaxGroupSize = KernelWidth;
+ }
+ } else {
+ const uint32_t NumSubslices = Device.getNumSubslices();
+ uint32_t NumThreadsPerSubslice = Device.getNumThreadsPerSubslice();
+ if (HalfNumThreads)
+ NumThreadsPerSubslice /= 2;
+
+ MaxGroupCount = NumSubslices * NumThreadsPerSubslice;
+ if (MaxGroupSizeForced) {
+      // Set the group size to fill the HW capacity.
+ uint32_t NumThreadsPerGroup = (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+ uint32_t NumGroupsPerSubslice =
+ (NumThreadsPerSubslice + NumThreadsPerGroup - 1) / NumThreadsPerGroup;
+ MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
+ } else {
+ assert(!MaxGroupSizeForced && !MaxGroupCountForced);
+ assert((MaxGroupSize <= KernelWidth || MaxGroupSize % KernelWidth == 0) &&
+ "Invalid maxGroupSize");
+ // Maximize group size
+ while (MaxGroupSize >= KernelWidth) {
+ uint32_t NumThreadsPerGroup =
+ (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+
+ if (NumThreadsPerSubslice % NumThreadsPerGroup == 0) {
+ uint32_t NumGroupsPerSubslice =
+ NumThreadsPerSubslice / NumThreadsPerGroup;
+ MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
+ break;
+ }
+ MaxGroupSize -= KernelWidth;
+ }
+ }
+ }
+
+ uint32_t GRPCounts[3] = {MaxGroupCount, 1, 1};
+ uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
+ bool UsedReductionSubscriptionRate = false;
+ if (!MaxGroupCountForced) {
+    GRPCounts[0] *= OptSubscRate;
+
+ size_t LoopTripcount = 0;
+ if (LoopLevels) {
+ // TODO: consider other possible LoopDesc uses
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "Loop descriptor provided but specific ND-range is disabled\n");
+ // TODO: get rid of this constraint
+ if (LoopLevels->NumLoops > 1) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "More than 1 loop found (%" PRIu32 "), ignoring loop info\n",
+ LoopLevels->NumLoops);
+ } else if (LoopLevels->Levels[0].Ub >= LoopLevels->Levels[0].Lb) {
+ LoopTripcount = (LoopLevels->Levels[0].Ub - LoopLevels->Levels[0].Lb +
+ LoopLevels->Levels[0].Stride) /
+ LoopLevels->Levels[0].Stride;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Loop TC = (%" PRId64 " - %" PRId64 " + %" PRId64 ") / %" PRId64
+ " = %zu\n",
+ LoopLevels->Levels[0].Ub, LoopLevels->Levels[0].Lb,
+ LoopLevels->Levels[0].Stride, LoopLevels->Levels[0].Stride,
+ LoopTripcount);
+ }
+ }
+
+ if (LoopTripcount && !UsedReductionSubscriptionRate) {
+ const size_t MaxTotalThreads = Device.getNumThreadsPerSubslice() *
+ Device.getNumSubslices() * SIMDWidth;
+ size_t AdjustedGroupCount =
+ IsTeamsNDRange ? (std::min)(((LoopTripcount + 7) & ~7),
+ MaxTotalThreads / GRPSizes[0])
+ : ((LoopTripcount + GRPSizes[0] - 1) / GRPSizes[0]);
+      AdjustedGroupCount = (std::max)(AdjustedGroupCount, size_t{1});
+ AdjustedGroupCount *= OptSubscRate;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Adjusting number of teams using the loop tripcount\n");
+ if (AdjustedGroupCount < GRPCounts[0])
+ GRPCounts[0] = AdjustedGroupCount;
+ }
+ }
+ GroupCounts.groupCountX = GRPCounts[0];
+ GroupCounts.groupCountY = GRPCounts[1];
+ GroupCounts.groupCountZ = GRPCounts[2];
+ std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
+}
+
+// Return the total number of HW threads required to execute
+// a loop kernel compiled with the given SIMDWidth, using the given
+// loop trip counts and group sizes.
+// Returns UINT64_MAX if the computation overflows.
+static uint64_t computeThreadsNeeded(const size_t (&TripCounts)[3],
+ const uint32_t (&GroupSizes)[3],
+ uint32_t SIMDWidth) {
+ uint64_t GroupCount[3];
+ for (int I = 0; I < 3; ++I) {
+ if (TripCounts[I] == 0 || GroupSizes[I] == 0)
+ return (std::numeric_limits<uint64_t>::max)();
+ GroupCount[I] =
+ (uint64_t(TripCounts[I]) + GroupSizes[I] - 1) / GroupSizes[I];
+ if (GroupCount[I] > (std::numeric_limits<uint32_t>::max)())
+ return (std::numeric_limits<uint64_t>::max)();
+ }
+ for (int I = 1; I < 3; ++I) {
+ if ((std::numeric_limits<uint64_t>::max)() / GroupCount[0] < GroupCount[I])
+ return (std::numeric_limits<uint64_t>::max)();
+ GroupCount[0] *= GroupCount[I];
+ }
+ // Multiplication of the group sizes must never overflow uint64_t
+ // for any existing device.
+ uint64_t LocalWorkSize =
+ uint64_t(GroupSizes[0]) * GroupSizes[1] * GroupSizes[2];
+ uint64_t ThreadsPerWG = ((LocalWorkSize + SIMDWidth - 1) / SIMDWidth);
+
+ // Check that the total number of threads fits uint64_t.
+ if ((std::numeric_limits<uint64_t>::max)() / GroupCount[0] < ThreadsPerWG)
+ return (std::numeric_limits<uint64_t>::max)();
+
+ return GroupCount[0] * ThreadsPerWG;
+}
+
+int32_t L0KernelTy::decideLoopKernelGroupArguments(
+ L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels,
+ uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads,
+ bool &AllowCooperative) const {
+
+ const auto DeviceId = Device.getDeviceId();
+ const auto &Options = LevelZeroPluginTy::getOptions();
+ const auto &KernelPR = getProperties();
+ uint32_t MaxGroupSize = Device.getMaxGroupSize();
+
+ bool MaxGroupSizeForced = false;
+ if (ThreadLimit > 0) {
+ MaxGroupSizeForced = true;
+ MaxGroupSize = ThreadLimit;
+ }
+
+ uint32_t GRPCounts[3] = {1, 1, 1};
+ uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
+ TgtLoopDescTy *Levels = LoopLevels->Levels;
+ int32_t DistributeDim = LoopLevels->DistributeDim;
+ assert(DistributeDim >= 0 && DistributeDim <= 2 &&
+ "Invalid distribute dimension.");
+ int32_t NumLoops = LoopLevels->NumLoops;
+ assert((NumLoops > 0 && NumLoops <= 3) &&
+ "Invalid loop nest description for ND partitioning");
+
+ // Compute global widths for X/Y/Z dimensions.
+ size_t TripCounts[3] = {1, 1, 1};
+
+ for (int32_t I = 0; I < NumLoops; I++) {
+ assert(Levels[I].Stride > 0 && "Invalid loop stride for ND partitioning");
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Loop %" PRIu32 ": lower bound = %" PRId64 ", upper bound = %" PRId64
+ ", Stride = %" PRId64 "\n",
+ I, Levels[I].Lb, Levels[I].Ub, Levels[I].Stride);
+ if (Levels[I].Ub < Levels[I].Lb)
+ TripCounts[I] = 0;
+ else
+ TripCounts[I] =
+ (Levels[I].Ub - Levels[I].Lb + Levels[I].Stride) / Levels[I].Stride;
+ }
+
+  // Check if any of the loops has zero iterations.
+ if (TripCounts[0] == 0 || TripCounts[1] == 0 || TripCounts[2] == 0) {
+ std::fill(GroupSizes, GroupSizes + 3, 1);
+ std::fill(GRPCounts, GRPCounts + 3, 1);
+ if (DistributeDim > 0 && TripCounts[DistributeDim] != 0) {
+ // There is a distribute dimension, and the distribute loop
+ // has non-zero iterations, but some inner parallel loop
+ // has zero iterations. We still want to split the distribute
+ // loop's iterations between many WGs (of size 1), but the inner/lower
+ // dimensions should be 1x1.
+ // Note that this code is currently dead, because we are not
+ // hoisting the inner loops' bounds outside of the target regions.
+ // The code is here just for completeness.
+ size_t DistributeTripCount = TripCounts[DistributeDim];
+ if (DistributeTripCount > UINT32_MAX) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Invalid number of teams %zu due to large loop trip count\n",
+ DistributeTripCount);
+ return OFFLOAD_FAIL;
+ }
+      GRPCounts[DistributeDim] = (uint32_t)DistributeTripCount;
+ }
+ AllowCooperative = false;
+ GroupCounts.groupCountX = GRPCounts[0];
+ GroupCounts.groupCountY = GRPCounts[1];
+ GroupCounts.groupCountZ = GRPCounts[2];
+ return OFFLOAD_SUCCESS;
+ }
+
+ if (!MaxGroupSizeForced) {
+ // Use zeKernelSuggestGroupSize to compute group sizes,
+    // or fall back to setting the dimension 0 width to SIMDWidth.
+    // Note that in case of a user-specified LWS, GRPSizes[0]
+ // is already set according to the specified value.
+ size_t GlobalSizes[3] = {TripCounts[0], TripCounts[1], TripCounts[2]};
+ if (DistributeDim > 0) {
+ // There is a distribute dimension.
+ GlobalSizes[DistributeDim - 1] *= GlobalSizes[DistributeDim];
+ GlobalSizes[DistributeDim] = 1;
+ }
+
+ {
+ if (MaxGroupSize > KernelPR.Width) {
+ GRPSizes[0] = KernelPR.Width;
+ }
+ if (DistributeDim == 0) {
+ // If there is a distribute dimension, then we do not use
+ // thin HW threads, since we do not know anything about
+ // the iteration space of the inner parallel loop regions.
+ //
+        // If there is no distribute dimension, then try to use thinner
+ // HW threads to get more independent HW threads executing
+ // the kernel - this may allow more parallelism due to
+ // the stalls being distributed across multiple HW threads rather
+ // than across SIMD lanes within one HW thread.
+ assert(GRPSizes[1] == 1 && GRPSizes[2] == 1 &&
+               "Unexpected team sizes for dimensions 1 and/or 2.");
+ uint32_t SimdWidth = KernelPR.SIMDWidth;
+ uint64_t TotalThreads = Device.getTotalThreads();
+ TotalThreads *= Options.ThinThreadsThreshold;
+
+ uint32_t GRPSizePrev = GRPSizes[0];
+ uint64_t ThreadsNeeded =
+ computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth);
+ while (ThreadsNeeded < TotalThreads) {
+ GRPSizePrev = GRPSizes[0];
+          // Try to halve the local work size (if possible) and see
+          // how many HW threads the kernel will require with this
+          // new local work size.
+          // In most implementations the initial GRPSizes[0]
+          // will be a power of two.
+ if (GRPSizes[0] <= 1)
+ break;
+ GRPSizes[0] >>= 1;
+ ThreadsNeeded = computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth);
+ }
+ GRPSizes[0] = GRPSizePrev;
+ }
+ }
+ }
+
+ for (int32_t I = 0; I < NumLoops; I++) {
+ if (I < DistributeDim) {
+ GRPCounts[I] = 1;
+ continue;
+ }
+ size_t Trip = TripCounts[I];
+ if (GRPSizes[I] >= Trip)
+ GRPSizes[I] = Trip;
+ size_t Count = (Trip + GRPSizes[I] - 1) / GRPSizes[I];
+ if (Count > UINT32_MAX) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Invalid number of teams %zu due to large loop trip count\n", Count);
+ return OFFLOAD_FAIL;
+ }
+ GRPCounts[I] = (uint32_t)Count;
+ }
+ AllowCooperative = false;
+ GroupCounts.groupCountX = GRPCounts[0];
+ GroupCounts.groupCountY = GRPCounts[1];
+ GroupCounts.groupCountZ = GRPCounts[2];
+ std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
+
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t L0KernelTy::getGroupsShape(L0DeviceTy &SubDevice, int32_t NumTeams,
+ int32_t ThreadLimit, uint32_t *GroupSizes,
+ ze_group_count_t &GroupCounts,
+ void *LoopDesc,
+ bool &AllowCooperative) const {
+
+ const auto SubId = SubDevice.getDeviceId();
+ const auto &KernelPR = getProperties();
+
+ // Detect if we need to reduce available HW threads. We need this adjustment
+ // on XeHPG when L0 debug is enabled (ZET_ENABLE_PROGRAM_DEBUGGING=1).
+ static std::once_flag OnceFlag;
+ static bool ZeDebugEnabled = false;
+ std::call_once(OnceFlag, []() {
+ const char *EnvVal = std::getenv("ZET_ENABLE_PROGRAM_DEBUGGING");
+ if (EnvVal && std::atoi(EnvVal) == 1)
+ ZeDebugEnabled = true;
+ });
+
+ // Read the most recent global thread limit and max teams.
+ auto [NumTeamsICV, ThreadLimitICV] = readTeamsThreadLimit();
+
+ bool IsXeHPG = SubDevice.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG);
+ bool HalfNumThreads = ZeDebugEnabled && IsXeHPG;
+ uint32_t KernelWidth = KernelPR.Width;
+ uint32_t SIMDWidth = KernelPR.SIMDWidth;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+ "Assumed kernel SIMD width is %" PRIu32 "\n", SIMDWidth);
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+ "Preferred team size is multiple of %" PRIu32 "\n", KernelWidth);
+ assert(SIMDWidth <= KernelWidth && "Invalid SIMD width.");
+
+ if (ThreadLimit > 0) {
+    // Use the thread_limit clause value.
+ DP("Max team size is set to %" PRId32 " (thread_limit clause)\n",
+ ThreadLimit);
+ } else if (ThreadLimitICV > 0) {
+    // Otherwise, use the thread-limit-var ICV.
+ ThreadLimit = ThreadLimitICV;
+ DP("Max team size is set to %" PRId32 " (thread-limit-icv)\n", ThreadLimit);
+ }
+
+ size_t MaxThreadLimit = SubDevice.getMaxGroupSize();
+ // Set correct max group size if the kernel was compiled with explicit SIMD
+ if (SIMDWidth == 1) {
+ MaxThreadLimit = SubDevice.getNumThreadsPerSubslice();
+ }
+
+ if (KernelPR.MaxThreadGroupSize < MaxThreadLimit) {
+ MaxThreadLimit = KernelPR.MaxThreadGroupSize;
+ DP("Capping maximum team size to %zu due to kernel constraints.\n",
+ MaxThreadLimit);
+ }
+
+ if (ThreadLimit > static_cast<int32_t>(MaxThreadLimit)) {
+ ThreadLimit = MaxThreadLimit;
+    DP("Max team size exceeds the current maximum %zu; adjusted\n",
+ MaxThreadLimit);
+ }
+ {
+ if (NumTeams > 0) {
+ DP("Number of teams is set to %" PRId32
+         " (num_teams clause or no teams construct)\n",
+ NumTeams);
+ } else if (NumTeamsICV > 0) {
+      // OMP_NUM_TEAMS only matters if the num_teams() clause is absent.
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+           "Using OMP_NUM_TEAMS(%" PRId32 ")\n", NumTeamsICV);
+
+ NumTeams = NumTeamsICV;
+ DP("Max number of teams is set to %" PRId32 " (OMP_NUM_TEAMS)\n",
+ NumTeams);
+ }
+
+ bool UseLoopTC = LoopDesc;
+ decideKernelGroupArguments(
+ SubDevice, (uint32_t)NumTeams, (uint32_t)ThreadLimit,
+ UseLoopTC ? (TgtNDRangeDescTy *)LoopDesc : nullptr, GroupSizes,
+ GroupCounts, HalfNumThreads, false);
+ AllowCooperative = false;
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
+ KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
+ __tgt_async_info *AsyncInfo) const {
+ // Libomptarget can pass negative NumTeams and ThreadLimit now after
+ // introducing __tgt_target_kernel. This happens only when we have valid
+ // LoopDesc and the region is not a teams region.
+
+ auto zeKernel = getZeKernel();
+ auto DeviceId = l0Device.getDeviceId();
+ int32_t NumArgs = KernelArgs.NumArgs;
+ int32_t NumTeams = KernelArgs.NumTeams[0];
+ int32_t ThreadLimit = KernelArgs.ThreadLimit[0];
+ void *LoopDesc = nullptr;
+
+ if (NumTeams < 0)
+ NumTeams = 0;
+ if (ThreadLimit < 0)
+ ThreadLimit = 0;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Executing a kernel " DPxMOD "...\n", DPxPTR(zeKernel));
+
+ auto &Plugin = l0Device.getPlugin();
+ auto &Device = Plugin.getDeviceFromId(DeviceId);
+
+ auto *IdStr = Device.getZeIdCStr();
+ auto &Options = LevelZeroPluginTy::getOptions();
+ bool IsAsync = AsyncInfo && Device.asyncEnabled();
+ if (IsAsync && !AsyncInfo->Queue) {
+ AsyncInfo->Queue = reinterpret_cast<void *>(Plugin.getAsyncQueue());
+ if (!AsyncInfo->Queue)
+ IsAsync = false; // Couldn't get a queue, revert to sync
+ }
+ auto *AsyncQueue =
+      IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : nullptr;
+
+  // We need a non-const reference to the properties structure in order to
+  // use its lock and to cache the group parameters and indirect flags.
+  auto &KernelPR = const_cast<KernelPropertiesTy &>(getProperties());
+  // Hold the lock from kernel preparation through submission, as kernels are
+  // shared.
+  std::unique_lock<std::mutex> KernelLock(KernelPR.Mtx);
+
+ // Decide group sizes and counts
+ uint32_t GroupSizes[3];
+ ze_group_count_t GroupCounts;
+
+ bool AllowCooperative = false;
+
+ // Check if we can reuse previous group parameters
+ bool GroupParamsReused = KernelPR.reuseGroupParams(
+ static_cast<TgtNDRangeDescTy *>(LoopDesc), NumTeams, ThreadLimit,
+ GroupSizes, GroupCounts, AllowCooperative);
+
+ if (!GroupParamsReused) {
+ auto RC = getGroupsShape(Device, NumTeams, ThreadLimit, GroupSizes,
+ GroupCounts, LoopDesc, AllowCooperative);
+
+ if (RC != OFFLOAD_SUCCESS) {
+ return RC;
+ }
+
+ KernelPR.cacheGroupParams(static_cast<TgtNDRangeDescTy *>(LoopDesc),
+ NumTeams, ThreadLimit, GroupSizes, GroupCounts,
+ AllowCooperative);
+ }
+
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Team sizes = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", GroupSizes[0],
+ GroupSizes[1], GroupSizes[2]);
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Number of teams = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n",
+ GroupCounts.groupCountX, GroupCounts.groupCountY,
+ GroupCounts.groupCountZ);
+ for (int32_t I = 0; I < NumArgs; I++) {
+    void *Arg = (static_cast<void **>(LaunchParams.Data))[I];
+    CALL_ZE_RET_FAIL(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg),
+                     Arg == nullptr ? nullptr : &Arg);
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+         "Kernel pointer argument %" PRId32 " (value: " DPxMOD
+         ") was set successfully for device %s.\n",
+         I, DPxPTR(Arg), IdStr);
+ }
+
+ // Set Kernel Indirect flags
+ auto &PrevFlags = KernelPR.IndirectAccessFlags;
+ ze_kernel_indirect_access_flags_t Flags = 0;
+ Flags |= Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags();
+ Flags |= Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags();
+
+ if (PrevFlags != Flags) {
+ // Combine with common access flags
+ const auto FinalFlags = Device.getIndirectFlags() | Flags;
+ CALL_ZE_RET_FAIL(zeKernelSetIndirectAccess, getZeKernel(), FinalFlags);
+ DP("Setting indirect access flags " DPxMOD "\n", DPxPTR(FinalFlags));
+ PrevFlags = Flags;
+ }
+
+ if (!GroupParamsReused) {
+ CALL_ZE_RET_FAIL(zeKernelSetGroupSize, zeKernel, GroupSizes[0],
+ GroupSizes[1], GroupSizes[2]);
+ }
+
+ ze_command_list_handle_t CmdList = nullptr;
+ ze_command_queue_handle_t CmdQueue = nullptr;
+ const bool UseImmCmdList = Device.useImmForCompute();
+
+ if (UseImmCmdList) {
+ CmdList = Device.getImmCmdList();
+ // Command queue is not used with immediate command list
+ } else {
+ CmdList = Device.getCmdList();
+ CmdQueue = Device.getCmdQueue();
+ }
+
+ if (UseImmCmdList) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Using immediate command list for kernel submission.\n");
+ auto Event = Device.getEvent();
+ size_t NumWaitEvents = 0;
+ ze_event_handle_t *WaitEvents = nullptr;
+ if (IsAsync && !AsyncQueue->WaitEvents.empty()) {
+ if (Options.CommandMode == CommandModeTy::AsyncOrdered) {
+ NumWaitEvents = 1;
+ WaitEvents = &AsyncQueue->WaitEvents.back();
+ } else {
+ NumWaitEvents = AsyncQueue->WaitEvents.size();
+ WaitEvents = AsyncQueue->WaitEvents.data();
+ }
+ }
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Kernel depends on %zu data copying events.\n", NumWaitEvents);
+ if (AllowCooperative)
+ CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+ zeKernel, &GroupCounts, Event, NumWaitEvents,
+ WaitEvents);
+ else
+ CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+ &GroupCounts, Event, NumWaitEvents, WaitEvents);
+ KernelLock.unlock();
+ if (IsAsync) {
+ AsyncQueue->WaitEvents.push_back(Event);
+ AsyncQueue->KernelEvent = Event;
+ } else {
+ CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+ Device.releaseEvent(Event);
+ }
+ } else {
+ ze_event_handle_t Event = nullptr;
+ KernelLock.unlock();
+ CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
+ CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, Device.getMutex(),
+ CmdQueue, 1, &CmdList, nullptr);
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
+ CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+ CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+ if (Event) {
+ Device.releaseEvent(Event);
+ }
----------------
AlexeySachkov wrote:
Where does kernel submission happen on this path? I do not see any calls which would append anything into `CmdList`
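For comparison, the immediate-command-list branch above appends the launch with `zeCommandListAppendLaunchKernel` / `zeCommandListAppendLaunchCooperativeKernel`, while this branch only closes, executes and resets `CmdList`. Something along these lines (a sketch only, reusing the macro and the variables already defined in this function; note that `Event` is still `nullptr` here, as in the current code) would be expected before `zeCommandListClose`:

```cpp
// Illustrative sketch, not part of the patch: append the kernel launch into
// the regular command list before it is closed and executed.
if (AllowCooperative)
  CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList,
                   zeKernel, &GroupCounts, Event, 0, nullptr);
else
  CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
                   &GroupCounts, Event, 0, nullptr);
```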
https://github.com/llvm/llvm-project/pull/158900
More information about the llvm-commits mailing list