[llvm] [OFFLOAD] Add plugin with support for Intel oneAPI Level Zero (PR #158900)
Alexey Sachkov via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 16 06:23:40 PDT 2025
================
@@ -0,0 +1,649 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericKernel implementation for the SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Kernel.h"
+#include "L0Device.h"
+#include "L0Plugin.h"
+#include "L0Program.h"
+
+namespace llvm::omp::target::plugin {
+
+Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
+ uint32_t NumThreads[3], uint32_t NumBlocks[3],
+ KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+
+  auto &L0Device = L0DeviceTy::makeL0Device(GenericDevice);
+  int32_t RC = runTargetTeamRegion(L0Device, KernelArgs,
+                                   std::move(LaunchParams), AsyncInfoWrapper);
+ if (RC == OFFLOAD_SUCCESS)
+ return Plugin::success();
+ return Plugin::error(error::ErrorCode::UNKNOWN,
+ "Error in launch Kernel %s: %d", getName(), RC);
+}
+
+Error L0KernelTy::buildKernel(L0ProgramTy &Program) {
+ const auto *KernelName = getName();
+
+ auto Module = Program.findModuleFromKernelName(KernelName);
+ ze_kernel_desc_t KernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0,
+ KernelName};
+ CALL_ZE_RET_ERROR(zeKernelCreate, Module, &KernelDesc, &zeKernel);
+ return Plugin::success();
+}
+
+Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice,
+ DeviceImageTy &Image) {
+ auto &Program = L0ProgramTy::makeL0Program(Image);
+
+ Error Err = buildKernel(Program);
+ if (Err)
+ return Err;
+ Program.addKernel(this);
+
+ return Plugin::success();
+}
+
+/// Read global thread limit and max teams from the host runtime. These values
+/// are subject to change at any program point, so every kernel execution
+/// needs to read the most recent values.
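+/// A call site can unpack the result as (illustrative):
+///   auto [NumTeams, ThreadLimit] = readTeamsThreadLimit();
+/// Both values are 0 when the corresponding limit is unset.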
+static std::tuple<int32_t, int32_t> readTeamsThreadLimit() {
+  int ThrLimit = omp_get_teams_thread_limit();
+ DP("omp_get_teams_thread_limit() returned %" PRId32 "\n", ThrLimit);
+  // omp_get_teams_thread_limit() would return INT_MAX by default.
+ // NOTE: Windows.h defines max() macro, so we have to guard
+ // the call with parentheses.
+ int32_t ThreadLimit =
+ (ThrLimit > 0 && ThrLimit != (std::numeric_limits<int32_t>::max)())
+ ? ThrLimit
+ : 0;
+
+ int NTeams = omp_get_max_teams();
+ DP("omp_get_max_teams() returned %" PRId32 "\n", NTeams);
+ // omp_get_max_teams() would return INT_MAX by default.
+ // NOTE: Windows.h defines max() macro, so we have to guard
+ // the call with parentheses.
+ int32_t NumTeams =
+ (NTeams > 0 && NTeams != (std::numeric_limits<int32_t>::max)()) ? NTeams
+ : 0;
+
+ return {NumTeams, ThreadLimit};
+}
+
+void L0KernelTy::decideKernelGroupArguments(
+ L0DeviceTy &Device, uint32_t NumTeams, uint32_t ThreadLimit,
+ TgtNDRangeDescTy *LoopLevels, uint32_t *GroupSizes,
+ ze_group_count_t &GroupCounts, bool HalfNumThreads,
+ bool IsTeamsNDRange) const {
+
+ const KernelPropertiesTy &KernelPR = getProperties();
+
+ const auto DeviceId = Device.getDeviceId();
+ bool MaxGroupSizeForced = false;
+ bool MaxGroupCountForced = false;
+ uint32_t MaxGroupSize = Device.getMaxGroupSize();
+ const auto &Option = LevelZeroPluginTy::getOptions();
+ const auto OptSubscRate = Option.SubscriptionRate;
+
+ uint32_t SIMDWidth = KernelPR.SIMDWidth;
+ uint32_t KernelWidth = KernelPR.Width;
+ uint32_t KernelMaxThreadGroupSize = KernelPR.MaxThreadGroupSize;
+
+ if (KernelMaxThreadGroupSize < MaxGroupSize) {
+ MaxGroupSize = KernelMaxThreadGroupSize;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Capping maximum team size to %" PRIu32
+ " due to kernel constraints.\n",
+ MaxGroupSize);
+ }
+
+ if (ThreadLimit > 0) {
+ MaxGroupSizeForced = true;
+ MaxGroupSize = ThreadLimit;
+ }
+
+ uint32_t MaxGroupCount = 0;
+ if (NumTeams > 0) {
+ MaxGroupCount = NumTeams;
+ MaxGroupCountForced = true;
+ }
+
+ if (MaxGroupCountForced) {
+    // If the number of teams is specified by the user, then use KernelWidth
+    // work-items per work-group by default, so that it matches
+    // decideLoopKernelGroupArguments() behavior.
+ if (!MaxGroupSizeForced) {
+ MaxGroupSize = KernelWidth;
+ }
+ } else {
+ const uint32_t NumSubslices = Device.getNumSubslices();
+ uint32_t NumThreadsPerSubslice = Device.getNumThreadsPerSubslice();
+ if (HalfNumThreads)
+ NumThreadsPerSubslice /= 2;
+
+ MaxGroupCount = NumSubslices * NumThreadsPerSubslice;
+ if (MaxGroupSizeForced) {
+ // Set group size for the HW capacity
+ uint32_t NumThreadsPerGroup = (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+ uint32_t NumGroupsPerSubslice =
+ (NumThreadsPerSubslice + NumThreadsPerGroup - 1) / NumThreadsPerGroup;
+ MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
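+      // Worked example (illustrative numbers): ThreadLimit = 64 and
+      // SIMDWidth = 16 give NumThreadsPerGroup = ceil(64 / 16) = 4; with
+      // 8 HW threads per subslice, NumGroupsPerSubslice = ceil(8 / 4) = 2
+      // and MaxGroupCount = 2 * NumSubslices.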
+ } else {
+ assert(!MaxGroupSizeForced && !MaxGroupCountForced);
+ assert((MaxGroupSize <= KernelWidth || MaxGroupSize % KernelWidth == 0) &&
+ "Invalid maxGroupSize");
+ // Maximize group size
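+      // Starting from the device maximum, shrink the group size in
+      // KernelWidth steps until the per-group HW-thread count evenly
+      // divides the threads of a subslice, so subslices are fully occupied.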
+ while (MaxGroupSize >= KernelWidth) {
+ uint32_t NumThreadsPerGroup =
+ (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+
+ if (NumThreadsPerSubslice % NumThreadsPerGroup == 0) {
+ uint32_t NumGroupsPerSubslice =
+ NumThreadsPerSubslice / NumThreadsPerGroup;
+ MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
+ break;
+ }
+ MaxGroupSize -= KernelWidth;
+ }
+ }
+ }
+
+ uint32_t GRPCounts[3] = {MaxGroupCount, 1, 1};
+ uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
+ bool UsedReductionSubscriptionRate = false;
+ if (!MaxGroupCountForced) {
+    GRPCounts[0] *= OptSubscRate;
+
+ size_t LoopTripcount = 0;
+ if (LoopLevels) {
+ // TODO: consider other possible LoopDesc uses
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Loop desciptor provided but specific ND-range is disabled\n");
+ // TODO: get rid of this constraint
+ if (LoopLevels->NumLoops > 1) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "More than 1 loop found (%" PRIu32 "), ignoring loop info\n",
+ LoopLevels->NumLoops);
+ } else if (LoopLevels->Levels[0].Ub >= LoopLevels->Levels[0].Lb) {
+ LoopTripcount = (LoopLevels->Levels[0].Ub - LoopLevels->Levels[0].Lb +
+ LoopLevels->Levels[0].Stride) /
+ LoopLevels->Levels[0].Stride;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Loop TC = (%" PRId64 " - %" PRId64 " + %" PRId64 ") / %" PRId64
+ " = %zu\n",
+ LoopLevels->Levels[0].Ub, LoopLevels->Levels[0].Lb,
+ LoopLevels->Levels[0].Stride, LoopLevels->Levels[0].Stride,
+ LoopTripcount);
+ }
+ }
+
+ if (LoopTripcount && !UsedReductionSubscriptionRate) {
+ const size_t MaxTotalThreads = Device.getNumThreadsPerSubslice() *
+ Device.getNumSubslices() * SIMDWidth;
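+      // For a teams ND-range, (LoopTripcount + 7) & ~7 rounds the trip
+      // count up to a multiple of 8, capped by the number of groups the
+      // device's HW threads can hold at the current group size; otherwise
+      // use one group per GRPSizes[0] iterations (ceiling division).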
+ size_t AdjustedGroupCount =
+ IsTeamsNDRange ? (std::min)(((LoopTripcount + 7) & ~7),
+ MaxTotalThreads / GRPSizes[0])
+ : ((LoopTripcount + GRPSizes[0] - 1) / GRPSizes[0]);
+      AdjustedGroupCount = (std::max)(AdjustedGroupCount, size_t{1});
+ AdjustedGroupCount *= OptSubscRate;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Adjusting number of teams using the loop tripcount\n");
+ if (AdjustedGroupCount < GRPCounts[0])
+ GRPCounts[0] = AdjustedGroupCount;
+ }
+ }
+ GroupCounts.groupCountX = GRPCounts[0];
+ GroupCounts.groupCountY = GRPCounts[1];
+ GroupCounts.groupCountZ = GRPCounts[2];
+ std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
+}
+
+// Return the total number of HW threads required to execute a loop kernel
+// compiled with the given SIMDWidth, for the given loop trip counts and
+// group sizes.
+// Returns UINT64_MAX if the computation overflows.
+static uint64_t computeThreadsNeeded(const size_t (&TripCounts)[3],
+ const uint32_t (&GroupSizes)[3],
+ uint32_t SIMDWidth) {
+ uint64_t GroupCount[3];
+ for (int I = 0; I < 3; ++I) {
+ if (TripCounts[I] == 0 || GroupSizes[I] == 0)
+ return (std::numeric_limits<uint64_t>::max)();
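+    // Ceiling division: groups needed to cover TripCounts[I] iterations.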
+ GroupCount[I] =
+ (uint64_t(TripCounts[I]) + GroupSizes[I] - 1) / GroupSizes[I];
+ if (GroupCount[I] > (std::numeric_limits<uint32_t>::max)())
+ return (std::numeric_limits<uint64_t>::max)();
+ }
+ for (int I = 1; I < 3; ++I) {
+ if ((std::numeric_limits<uint64_t>::max)() / GroupCount[0] < GroupCount[I])
+ return (std::numeric_limits<uint64_t>::max)();
+ GroupCount[0] *= GroupCount[I];
+ }
+ // Multiplication of the group sizes must never overflow uint64_t
+ // for any existing device.
+ uint64_t LocalWorkSize =
+ uint64_t(GroupSizes[0]) * GroupSizes[1] * GroupSizes[2];
+ uint64_t ThreadsPerWG = ((LocalWorkSize + SIMDWidth - 1) / SIMDWidth);
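+  // Illustrative values: LocalWorkSize = 100 and SIMDWidth = 16 give
+  // ThreadsPerWG = (100 + 15) / 16 = 7 HW threads per work-group.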
+
+ // Check that the total number of threads fits uint64_t.
+ if ((std::numeric_limits<uint64_t>::max)() / GroupCount[0] < ThreadsPerWG)
+ return (std::numeric_limits<uint64_t>::max)();
+
+ return GroupCount[0] * ThreadsPerWG;
+}
+
+int32_t L0KernelTy::decideLoopKernelGroupArguments(
+ L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels,
+ uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads,
+ bool &AllowCooperative) const {
+
+ const auto DeviceId = Device.getDeviceId();
+ const auto &Options = LevelZeroPluginTy::getOptions();
+ const auto &KernelPR = getProperties();
+ uint32_t MaxGroupSize = Device.getMaxGroupSize();
+
+ bool MaxGroupSizeForced = false;
+ if (ThreadLimit > 0) {
+ MaxGroupSizeForced = true;
+ MaxGroupSize = ThreadLimit;
+ }
+
+ uint32_t GRPCounts[3] = {1, 1, 1};
+ uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
+ TgtLoopDescTy *Levels = LoopLevels->Levels;
+ int32_t DistributeDim = LoopLevels->DistributeDim;
+ assert(DistributeDim >= 0 && DistributeDim <= 2 &&
+ "Invalid distribute dimension.");
+ int32_t NumLoops = LoopLevels->NumLoops;
+ assert((NumLoops > 0 && NumLoops <= 3) &&
+ "Invalid loop nest description for ND partitioning");
+
+ // Compute global widths for X/Y/Z dimensions.
+ size_t TripCounts[3] = {1, 1, 1};
+
+ for (int32_t I = 0; I < NumLoops; I++) {
+ assert(Levels[I].Stride > 0 && "Invalid loop stride for ND partitioning");
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Loop %" PRIu32 ": lower bound = %" PRId64 ", upper bound = %" PRId64
+ ", Stride = %" PRId64 "\n",
+ I, Levels[I].Lb, Levels[I].Ub, Levels[I].Stride);
+ if (Levels[I].Ub < Levels[I].Lb)
+ TripCounts[I] = 0;
+ else
+ TripCounts[I] =
+ (Levels[I].Ub - Levels[I].Lb + Levels[I].Stride) / Levels[I].Stride;
+ }
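+  // Illustrative values: Lb = 0, Ub = 99, Stride = 2 give
+  // TripCounts[I] = (99 - 0 + 2) / 2 = 50 iterations.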
+
+  // Check if any of the loops has zero iterations.
+ if (TripCounts[0] == 0 || TripCounts[1] == 0 || TripCounts[2] == 0) {
+ std::fill(GroupSizes, GroupSizes + 3, 1);
+ std::fill(GRPCounts, GRPCounts + 3, 1);
+ if (DistributeDim > 0 && TripCounts[DistributeDim] != 0) {
+ // There is a distribute dimension, and the distribute loop
+ // has non-zero iterations, but some inner parallel loop
+ // has zero iterations. We still want to split the distribute
+ // loop's iterations between many WGs (of size 1), but the inner/lower
+ // dimensions should be 1x1.
+ // Note that this code is currently dead, because we are not
+ // hoisting the inner loops' bounds outside of the target regions.
+ // The code is here just for completeness.
+ size_t DistributeTripCount = TripCounts[DistributeDim];
+ if (DistributeTripCount > UINT32_MAX) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Invalid number of teams %zu due to large loop trip count\n",
+ DistributeTripCount);
+ return OFFLOAD_FAIL;
+ }
+      GRPCounts[DistributeDim] = static_cast<uint32_t>(DistributeTripCount);
+ }
+ AllowCooperative = false;
+ GroupCounts.groupCountX = GRPCounts[0];
+ GroupCounts.groupCountY = GRPCounts[1];
+ GroupCounts.groupCountZ = GRPCounts[2];
+ return OFFLOAD_SUCCESS;
+ }
+
+ if (!MaxGroupSizeForced) {
+    // Use zeKernelSuggestGroupSize to compute group sizes, or fall back to
+    // setting the dimension 0 width to SIMDWidth. Note that in the case of a
+    // user-specified LWS, GRPSizes[0] is already set according to the
+    // specified value.
+ size_t GlobalSizes[3] = {TripCounts[0], TripCounts[1], TripCounts[2]};
+ if (DistributeDim > 0) {
+ // There is a distribute dimension.
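+      // Fold its trip count into the next-lower dimension so that the
+      // suggested group sizes are computed over the combined global size.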
+ GlobalSizes[DistributeDim - 1] *= GlobalSizes[DistributeDim];
+ GlobalSizes[DistributeDim] = 1;
+ }
+
+ {
----------------
AlexeySachkov wrote:
Why is this extra scope needed?
https://github.com/llvm/llvm-project/pull/158900
More information about the llvm-commits
mailing list