[llvm] [OFFLOAD] Add plugin with support for Intel oneAPI Level Zero (PR #158900)
Alexey Sachkov via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 16 06:23:40 PDT 2025
================
@@ -0,0 +1,189 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero RTL Options support
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <level_zero/ze_api.h>
+
+#include "Shared/EnvironmentVar.h"
+
+#include "L0Defs.h"
+
+#include <cctype>
+#include <cstring>
+
+namespace llvm::omp::target::plugin {
+/// Command submission mode
+enum class CommandModeTy {
+  Sync = 0,
+  Async = 1,
+  AsyncOrdered = 2,
+};
+
+/// Specialization constants used for a module compilation.
+///
+/// Every value handed to addConstant() is copied into its own `new char[]`
+/// buffer. This object owns those buffers and frees them in its destructor,
+/// so it is move-only: copying would leave two owners of the same buffers.
+class SpecConstantsTy {
+  /// Ids of the registered constants; entry I pairs with ConstantValues[I].
+  std::vector<uint32_t> ConstantIds;
+  /// Owning pointers to the heap buffers holding the constant values.
+  std::vector<const void *> ConstantValues;
+
+public:
+  SpecConstantsTy() = default;
+  SpecConstantsTy(const SpecConstantsTy &) = delete;
+  SpecConstantsTy &operator=(const SpecConstantsTy &) = delete;
+  SpecConstantsTy &operator=(SpecConstantsTy &&) = delete;
+
+  /// Takes over all buffers owned by \p Other, which is left empty.
+  ///
+  /// The parameter is deliberately a non-const rvalue reference. With a
+  /// `const &&` parameter, std::move on the members selects the vector copy
+  /// constructor, both objects keep the same pointers, and each destructor
+  /// would then free them.
+  SpecConstantsTy(SpecConstantsTy &&Other) noexcept
+      : ConstantIds(std::move(Other.ConstantIds)),
+        ConstantValues(std::move(Other.ConstantValues)) {}
+
+  /// Releases every value buffer allocated by addConstant().
+  ~SpecConstantsTy() {
+    for (const void *Value : ConstantValues)
+      delete[] static_cast<const char *>(Value);
+  }
+
+  /// Registers the constant \p Id with a private copy of \p Val.
+  ///
+  /// The value is stored as raw bytes and no destructor is ever run for the
+  /// stored copy, so \p T is expected to be trivially copyable.
+  template <typename T> void addConstant(uint32_t Id, T Val) {
+    char *ValuePtr = new char[sizeof(T)];
+    // Copy the object representation instead of assigning through a
+    // reinterpret_cast'ed pointer: no T object lives in the buffer yet.
+    std::memcpy(ValuePtr, &Val, sizeof(T));
+
+    ConstantIds.push_back(Id);
+    ConstantValues.push_back(ValuePtr);
+  }
+
+  /// Builds the Level Zero descriptor for the registered constants.
+  ///
+  /// The returned struct points into the vectors of this object; it must not
+  /// be used after this object is destroyed or after a later addConstant()
+  /// call has grown the vectors.
+  ze_module_constants_t getModuleConstants() const {
+    ze_module_constants_t Tmp{static_cast<uint32_t>(ConstantValues.size()),
+                              ConstantIds.data(),
+                              // Unfortunately we have to const_cast it.
+                              // L0 data type should probably be fixed.
+                              const_cast<const void **>(ConstantValues.data())};
+    return Tmp;
+  }
+};
+/// Shorthand for `static constexpr`, used below to declare the compile-time
+/// constant members of L0OptionsTy.
+/// NOTE(review): no matching #undef is visible in this part of the header.
+#define FIXED static constexpr
+
+/// L0 Plugin flags
+///
+/// Boolean plugin options stored as bit-fields of a uint64_t.
+struct L0OptionFlagsTy {
+  /// Initialized to 1 by the default constructor.
+  uint64_t UseMemoryPool : 1;
+  /// Unused remaining bits, initialized to 0.
+  uint64_t Reserved : 63;
+
+  L0OptionFlagsTy()
+      : UseMemoryPool{1}, Reserved{0} {}
+};
+
+struct L0OptionsTy {
+ /// Binary flags
+ L0OptionFlagsTy Flags;
+
+ /// Staging buffer size
+ size_t StagingBufferSize = L0StagingBufferSize;
+
+ /// Staging buffer count
+ size_t StagingBufferCount = L0StagingBufferCount;
+
+ // TODO: This should probably be an array indexed by AllocKind
+ /// Memory pool parameters
+ /// MemPoolInfo[MemType] = {AllocMax(MB), Capacity, PoolSize(MB)}
+ std::map<int32_t, std::array<int32_t, 3>> MemPoolInfo = {
+ {TARGET_ALLOC_DEVICE, {1, 4, 256}},
+ {TARGET_ALLOC_HOST, {1, 4, 256}},
+ {TARGET_ALLOC_SHARED, {8, 4, 256}}};
+
+ /// Parameters for memory pools dedicated to reduction scratch space
+ std::array<int32_t, 3> ReductionPoolInfo{256, 8, 8192};
+
+ /// Oversubscription rate for normal kernels
+ FIXED uint32_t SubscriptionRate = 4;
+
+ /// Loop kernels with known ND-range may be known to have
+ /// few iterations and they may not exploit the offload device
+ /// to the fullest extent.
+ /// Let's assume a device has N total HW threads available,
+ /// and the kernel requires M hardware threads with LWS set to L.
+ /// If (M < N * ThinThreadsThreshold), then we will try
+ /// to iteratively divide L by 2 to increase the number of HW
+ /// threads used for executing the kernel. Effectively, we will
+ /// end up with L less than the kernel's SIMD width, so the HW
+ /// threads will not use all their SIMD lanes. This (presumably) should
+ /// allow more parallelism, because the stalls in the SIMD lanes
+ /// will be distributed across more HW threads, and the probability
+ /// of having a stall (or a sequence of stalls) on a critical path
+ /// in the kernel should decrease.
+ /// Anyway, this is just a heuristic that seems to work well for some
+ /// kernels (which poorly expose parallelism in the first place).
+ FIXED double ThinThreadsThreshold = 0.1;
+
+ /// List of Root devices provided via option ONEAPI_DEVICE_SELECTOR
+ /// All discard filters should come before any accept filter.
+ std::vector<std::tuple<bool, int32_t, int32_t, int32_t>> ExplicitRootDevices;
+
+ /// Is the given RootID, SubID, CcsID specified in ONEAPI_DEVICE_SELECTOR
+ bool shouldAddDevice(int32_t RootID, int32_t SubID, int32_t CCSID) const;
+
+ // Compilation options for IGC
+ // OpenCL 2.0 builtins (like atomic_load_explicit, etc.) are used by
+ // runtime, so we have to explicitly specify the "-cl-std=CL2.0" compilation
+ // option. With it, the SPIR-V will be converted to LLVM IR with OpenCL 2.0
+ // builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
+ // builtins.
+ std::string CompilationOptions = "-cl-std=CL2.0 ";
+ std::string InternalCompilationOptions = "-cl-take-global-address";
+ std::string UserCompilationOptions = "";
+
+ // Spec constants used for all modules.
+ SpecConstantsTy CommonSpecConstants;
+
+ /// Command execution mode.
+ /// Whether the runtime uses asynchronous mode or not depends on the type of
+ /// devices and whether immediate command list is fully enabled.
+ CommandModeTy CommandMode = CommandModeTy::Async;
+
+ bool Init = false; // have the options already been processed
+
+ /// Default constructor. It does not read any environment variables itself;
+ /// that is done by processEnvironmentVars(), which init() invokes.
+ L0OptionsTy() {}
+
+ void processEnvironmentVars();
+
+ /// Processes the environment variables on the first call. Later calls
+ /// return immediately because Init has already been set.
+ void init() {
+   if (Init)
+     return;
+
+   processEnvironmentVars();
+   Init = true;
+ }
+
+ /// Parse the string and split it into tokens of string_views based on the
+ /// Delim character.
+ std::vector<std::string_view> tokenize(const std::string_view &Filter,
+ const std::string &Delim,
+ bool ProhibitEmptyTokens = false);
+
+ /// Returns true if \p str is non-empty and every character in it is a
+ /// decimal digit; returns false otherwise, including for an empty string.
+ bool isDigits(const std::string_view &str) {
+   if (str.empty())
+     return false;
+   // The <cctype> classifiers have undefined behavior for negative `char`
+   // values, so each character is converted to `unsigned char` first.
+   return std::all_of(str.begin(), str.end(), [](unsigned char C) {
+     return std::isdigit(C) != 0;
+   });
+ }
+
+ /// Case-insensitive comparison of \p Var and \p Matched.
+ /// \returns true if both strings have the same length and all characters
+ /// compare equal after std::tolower; false otherwise.
+ bool match(const std::string &Var, const std::string &Matched) {
+   if (Var.size() != Matched.size())
+     return false;
+
+   // std::tolower has undefined behavior for negative `char` values, so the
+   // characters are taken as `unsigned char`.
+   auto EqualsIgnoreCase = [](unsigned char A, unsigned char B) {
+     return std::tolower(A) == std::tolower(B);
+   };
+   return std::equal(Var.begin(), Var.end(), Matched.begin(), Matched.end(),
+                     EqualsIgnoreCase);
+ }
+
+ /// Overload of match() that takes the reference text as a C string.
+ /// \p Matched is converted to a std::string and forwarded to the
+ /// std::string overload; it must not be null.
+ bool match(const std::string &Var, const char *Matched) {
+   return match(Var, std::string(Matched));
+ }
----------------
AlexeySachkov wrote:
`std::string` is an inefficient common denominator here; `StringRef` is a better choice, especially considering that it has an [`equals_insensitive`](https://llvm.org/doxygen/classllvm_1_1StringRef.html#ae46058c90a3c703357331a6501b32f1c) method.
https://github.com/llvm/llvm-project/pull/158900
More information about the llvm-commits
mailing list