[llvm] [OFFLOAD] Add plugin with support for Intel oneAPI Level Zero (PR #158900)
Alex Duran via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 25 06:06:15 PDT 2025
================
@@ -0,0 +1,680 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericDevice instantiation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H
+
+#include "llvm/ADT/SmallVector.h"
+
+#include "PerThreadTable.h"
+
+#include "AsyncQueue.h"
+#include "L0Context.h"
+#include "L0Program.h"
+#include "PluginInterface.h"
+#include "TLS.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+using OmpInteropTy = omp_interop_val_t *;
+class LevelZeroPluginTy;
+
+// clang-format off
+enum class PCIIdTy : int32_t { // Device-ID family values; L0DeviceTy::isDiscrete() casts (PCIId & 0xFF00) to this type.
+ None = 0x0000,
+ SKL = 0x1900,
+ KBL = 0x5900,
+ CFL = 0x3E00,
+ CFL_2 = 0x9B00,
+ ICX = 0x8A00,
+ TGL = 0xFF20, // NOTE(review): low byte is non-zero, so an ID masked with 0xFF00 can never compare equal to this value - confirm intended.
+ TGL_2 = 0x9A00,
+ DG1 = 0x4900,
+ RKL = 0x4C00,
+ ADLS = 0x4600,
+ RTL = 0xA700,
+ MTL = 0x7D00,
+ PVC = 0x0B00,
+ DG2_ATS_M = 0x4F00,
+ DG2_ATS_M_2 = 0x5600,
+ LNL = 0x6400,
+ BMG = 0xE200, // The only family L0DeviceTy::isDiscrete(uint32_t) reports as discrete.
+};
+
+/// Device type enumeration common to compiler and runtime. Every non-zero enumerator is a distinct single bit (presumably so values can be combined or tested as a mask - TODO confirm).
+enum class DeviceArchTy : uint64_t {
+ DeviceArch_None = 0, // Default value of L0DeviceTy::DeviceArch.
+ DeviceArch_Gen = 0x0001, // Gen 9, Gen 11 or Xe
+ DeviceArch_XeLPG = 0x0002,
+ DeviceArch_XeHPC = 0x0004,
+ DeviceArch_XeHPG = 0x0008,
+ DeviceArch_Xe2LP = 0x0010,
+ DeviceArch_Xe2HP = 0x0020,
+ DeviceArch_x86_64 = 0x0100
+};
+// clang-format on
+
+struct L0DeviceIdTy { // Bundles a Level Zero device handle with its root / sub-device / CCS indices.
+ ze_device_handle_t zeId; // Level Zero handle of the device.
+ int32_t RootId; // Always supplied by the caller (no default).
+ int32_t SubId; // -1 when the constructor default is used (presumably "no sub-device" - confirm).
+ int32_t CCSId; // -1 when the constructor default is used (presumably "no specific CCS" - confirm).
+
+ L0DeviceIdTy(ze_device_handle_t Device, int32_t RootId, int32_t SubId = -1,
+ int32_t CCSId = -1)
+ : zeId(Device), RootId(RootId), SubId(SubId), CCSId(CCSId) {} // Stores the arguments unchanged.
+};
+
+/// Holder for the Level Zero command lists and command queues of one device.
+/// Instances are stored in L0DeviceTLSTableTy, a PerThreadContainer.
+///
+/// The class only stores handles: they are installed through the setters and
+/// released by clear(). The destructor releases nothing; it only asserts that
+/// every handle is already null, so clear() must run (or the object must have
+/// been moved from) before destruction. The type is move-constructible only:
+/// copying and both assignment operators are deleted.
+class L0DeviceTLSTy {
+  /// Command list for each device
+  ze_command_list_handle_t CmdList = nullptr;
+
+  /// Main copy command list for each device
+  ze_command_list_handle_t CopyCmdList = nullptr;
+
+  /// Link copy command list for each device
+  ze_command_list_handle_t LinkCopyCmdList = nullptr;
+
+  /// Command queue for each device
+  ze_command_queue_handle_t CmdQueue = nullptr;
+
+  /// Main copy command queue for each device
+  ze_command_queue_handle_t CopyCmdQueue = nullptr;
+
+  /// Link copy command queues for each device
+  ze_command_queue_handle_t LinkCopyCmdQueue = nullptr;
+
+  /// Immediate command list for each device
+  ze_command_list_handle_t ImmCmdList = nullptr;
+
+  /// Immediate copy command list for each device
+  ze_command_list_handle_t ImmCopyCmdList = nullptr;
+
+public:
+  L0DeviceTLSTy() = default;
+
+  /// Releases nothing. In assert-enabled builds, verifies that every handle
+  /// has already been reset to nullptr.
+  ~L0DeviceTLSTy() {
+    assert(CmdList == nullptr && "CmdList is not nullptr on destruction");
+    assert(CopyCmdList == nullptr &&
+           "CopyCmdList is not nullptr on destruction");
+    assert(LinkCopyCmdList == nullptr &&
+           "LinkCopyCmdList is not nullptr on destruction");
+    assert(CmdQueue == nullptr && "CmdQueue is not nullptr on destruction");
+    assert(CopyCmdQueue == nullptr &&
+           "CopyCmdQueue is not nullptr on destruction");
+    assert(LinkCopyCmdQueue == nullptr &&
+           "LinkCopyCmdQueue is not nullptr on destruction");
+    assert(ImmCmdList == nullptr && "ImmCmdList is not nullptr on destruction");
+    assert(ImmCopyCmdList == nullptr &&
+           "ImmCopyCmdList is not nullptr on destruction");
+  }
+
+  L0DeviceTLSTy(const L0DeviceTLSTy &) = delete;
+
+  /// Takes over every handle held by \p Other and leaves \p Other with all
+  /// handles null, so its destructor assertions hold. Declared noexcept since
+  /// it only exchanges handle values; initializers follow declaration order.
+  L0DeviceTLSTy(L0DeviceTLSTy &&Other) noexcept
+      : CmdList(std::exchange(Other.CmdList, nullptr)),
+        CopyCmdList(std::exchange(Other.CopyCmdList, nullptr)),
+        LinkCopyCmdList(std::exchange(Other.LinkCopyCmdList, nullptr)),
+        CmdQueue(std::exchange(Other.CmdQueue, nullptr)),
+        CopyCmdQueue(std::exchange(Other.CopyCmdQueue, nullptr)),
+        LinkCopyCmdQueue(std::exchange(Other.LinkCopyCmdQueue, nullptr)),
+        ImmCmdList(std::exchange(Other.ImmCmdList, nullptr)),
+        ImmCopyCmdList(std::exchange(Other.ImmCopyCmdList, nullptr)) {}
+
+  /// Destroys every non-null command list, then every non-null command queue,
+  /// and finally resets all handles to nullptr. Failures are handled by
+  /// CALL_ZE_EXIT_FAIL.
+  void clear() {
+    // Command lists are destroyed before the command queues.
+    if (CmdList)
+      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CmdList);
+    if (CopyCmdList)
+      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CopyCmdList);
+    if (LinkCopyCmdList)
+      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, LinkCopyCmdList);
+    if (ImmCmdList)
+      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, ImmCmdList);
+    if (ImmCopyCmdList)
+      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, ImmCopyCmdList);
+    if (CmdQueue)
+      CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CmdQueue);
+    if (CopyCmdQueue)
+      CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CopyCmdQueue);
+    if (LinkCopyCmdQueue)
+      CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, LinkCopyCmdQueue);
+
+    // Reset everything so the destructor assertions are satisfied.
+    CmdList = nullptr;
+    CopyCmdList = nullptr;
+    LinkCopyCmdList = nullptr;
+    CmdQueue = nullptr;
+    CopyCmdQueue = nullptr;
+    LinkCopyCmdQueue = nullptr;
+    ImmCmdList = nullptr;
+    ImmCopyCmdList = nullptr;
+  }
+
+  L0DeviceTLSTy &operator=(const L0DeviceTLSTy &) = delete;
+  L0DeviceTLSTy &operator=(L0DeviceTLSTy &&) = delete;
+
+  // Accessors. Getters return the stored handle by value; setters overwrite
+  // the stored handle without destroying the previous one.
+  auto getCmdList() const { return CmdList; }
+  void setCmdList(ze_command_list_handle_t NewCmdList) { CmdList = NewCmdList; }
+
+  auto getCopyCmdList() const { return CopyCmdList; }
+  void setCopyCmdList(ze_command_list_handle_t NewCopyCmdList) {
+    CopyCmdList = NewCopyCmdList;
+  }
+
+  auto getLinkCopyCmdList() const { return LinkCopyCmdList; }
+  void setLinkCopyCmdList(ze_command_list_handle_t NewLinkCopyCmdList) {
+    LinkCopyCmdList = NewLinkCopyCmdList;
+  }
+
+  auto getImmCmdList() const { return ImmCmdList; }
+  void setImmCmdList(ze_command_list_handle_t NewImmCmdList) {
+    ImmCmdList = NewImmCmdList;
+  }
+
+  auto getImmCopyCmdList() const { return ImmCopyCmdList; }
+  void setImmCopyCmdList(ze_command_list_handle_t NewImmCopyCmdList) {
+    ImmCopyCmdList = NewImmCopyCmdList;
+  }
+
+  auto getCmdQueue() const { return CmdQueue; }
+  void setCmdQueue(ze_command_queue_handle_t NewCmdQueue) {
+    CmdQueue = NewCmdQueue;
+  }
+
+  auto getCopyCmdQueue() const { return CopyCmdQueue; }
+  void setCopyCmdQueue(ze_command_queue_handle_t NewCopyCmdQueue) {
+    CopyCmdQueue = NewCopyCmdQueue;
+  }
+
+  auto getLinkCopyCmdQueue() const { return LinkCopyCmdQueue; }
+  void setLinkCopyCmdQueue(ze_command_queue_handle_t NewLinkCopyCmdQueue) {
+    LinkCopyCmdQueue = NewLinkCopyCmdQueue;
+  }
+};
+
+/// Table of L0DeviceTLSTy entries built on PerThreadContainer, instantiated
+/// with std::vector<L0DeviceTLSTy> and 8 as template arguments (presumably one
+/// vector per thread - confirm against PerThreadTable.h).
+struct L0DeviceTLSTableTy
+    : public PerThreadContainer<std::vector<L0DeviceTLSTy>, 8> {
+  /// Forwards to PerThreadTable::clear, handing it a callback that calls
+  /// L0DeviceTLSTy::clear() on the entry it receives.
+  void clear() {
+    auto ReleaseEntry = [](L0DeviceTLSTy &TLSEntry) { TLSEntry.clear(); };
+    PerThreadTable::clear(ReleaseEntry);
+  }
+};
+
+class L0DeviceTy final : public GenericDeviceTy {
+ // Level Zero Context for this Device
+ L0ContextTy &l0Context;
+
+ // Level Zero handle for this Device
+ ze_device_handle_t zeDevice;
+ // Device Properties
+ ze_device_properties_t DeviceProperties{};
+ ze_device_compute_properties_t ComputeProperties{};
+ ze_device_memory_properties_t MemoryProperties{};
+ ze_device_cache_properties_t CacheProperties{};
+
+ /// Devices' default target allocation kind for internal allocation
+ int32_t AllocKind = TARGET_ALLOC_DEVICE;
+
+ DeviceArchTy DeviceArch = DeviceArchTy::DeviceArch_None;
+
+ std::string DeviceName;
+
+ /// Common indirect access flags for this device
+ ze_kernel_indirect_access_flags_t IndirectAccessFlags = 0;
+
+ /// Device UUID for toplevel devices only
+ std::string DeviceUuid;
+
+ /// L0 Device ID as string
+ std::string zeId;
+
+ /// Command queue group ordinals for each device
+ std::pair<uint32_t, uint32_t> ComputeOrdinal{UINT32_MAX, 0};
+ /// Command queue group ordinals for copying
+ std::pair<uint32_t, uint32_t> CopyOrdinal{UINT32_MAX, 0};
+ /// Command queue group ordinals and number of queues for link copy engines
+ std::pair<uint32_t, uint32_t> LinkCopyOrdinal{UINT32_MAX, 0};
+
+ /// Command queue index for each device
+ uint32_t ComputeIndex = 0;
+
+ bool IsAsyncEnabled = false;
+
+ // lock for this device
+ std::mutex Mutex;
+
+ /// Contains all modules (possibly from multiple device images) to handle
+ /// dynamic link across multiple images
+ llvm::SmallVector<ze_module_handle_t> GlobalModules;
+
+ /// L0 programs created for this device
+ std::list<L0ProgramTy> Programs;
+
+ /// MemAllocator for this device
+ MemAllocatorTy MemAllocator;
+
+ /// The current size of the global device memory pool (managed by us).
+ uint64_t HeapSize = 1L << 23L /*8MB=*/;
+
+ int32_t synchronize(__tgt_async_info *AsyncInfo, bool ReleaseQueue = true);
+ int32_t submitData(void *TgtPtr, const void *HstPtr, int64_t Size,
+ __tgt_async_info *AsyncInfo);
+ int32_t retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size,
+ __tgt_async_info *AsyncInfo);
+
+ bool shouldSetupDeviceMemoryPool() const override { return false; }
+ DeviceArchTy computeArch() const;
+
+ /// Get default compute group ordinal. Returns Ordinal-NumQueues pair
+ std::pair<uint32_t, uint32_t> findComputeOrdinal();
+
+ /// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair
+ std::pair<uint32_t, uint32_t> findCopyOrdinal(bool LinkCopy = false);
+
+ Error internalInit();
+
+public:
+ L0DeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
+ ze_device_handle_t zeDevice, L0ContextTy &DriverInfo,
+ const std::string_view zeId, int32_t ComputeIndex)
+ : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}),
+ l0Context(DriverInfo), zeDevice(zeDevice), zeId(zeId),
+ ComputeIndex(ComputeIndex) {
+ DeviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+ DeviceProperties.pNext = nullptr;
+ ComputeProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES;
+ ComputeProperties.pNext = nullptr;
+ MemoryProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES;
+ MemoryProperties.pNext = nullptr;
+ CacheProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES;
+ CacheProperties.pNext = nullptr;
+
+ auto Err = internalInit();
+ if (Err)
+ FATAL_MESSAGE(DeviceId, "Couldn't initialize device: %s\n",
+ toString(std::move(Err)).c_str());
+ }
+
+ static L0DeviceTy &makeL0Device(GenericDeviceTy &Device) {
+ return static_cast<L0DeviceTy &>(Device);
+ }
+
+ auto &getPlugin() { return (LevelZeroPluginTy &)Plugin; }
+ L0DeviceTLSTy &getTLS();
+
+ Error setContext() override { return Plugin::success(); }
+ Error initImpl(GenericPluginTy &Plugin) override;
+ Error deinitImpl() override {
+ Programs.clear();
+ return Plugin::success();
+ }
+
+ auto getZeDevice() const { return zeDevice; }
+
+ const L0ContextTy &getL0Context() const { return l0Context; }
+ L0ContextTy &getL0Context() { return l0Context; }
+
+ const std::string_view getName() const { return DeviceName; }
+ const char *getNameCStr() const { return DeviceName.c_str(); }
+
+ const std::string_view getZeId() const { return zeId; }
+ const char *getZeIdCStr() const { return zeId.c_str(); }
+
+ std::mutex &getMutex() { return Mutex; }
+
+ auto getComputeIndex() const { return ComputeIndex; }
+ auto getIndirectFlags() const { return IndirectAccessFlags; }
+
+ auto getNumGlobalModules() const { return GlobalModules.size(); }
+ void addGlobalModule(ze_module_handle_t Module) {
+ GlobalModules.push_back(Module);
+ }
+ auto getGlobalModulesArray() { return GlobalModules.data(); }
+
+ L0ProgramTy *getProgramFromImage(MemoryBufferRef Image) {
+ for (auto &PGM : Programs)
+ if (PGM.getMemoryBuffer() == Image)
+ return &PGM;
+ return nullptr;
+ }
+
+ int32_t buildAllKernels() {
+ for (auto &PGM : Programs) {
+ int32_t RC = PGM.loadModuleKernels();
+ if (RC != OFFLOAD_SUCCESS)
+ return RC;
+ }
+ return OFFLOAD_SUCCESS;
+ }
+
+ // add a new program to the device. Return a reference to the new program
+ auto &addProgram(int32_t ImageId, std::unique_ptr<MemoryBuffer> &&Image) {
+ Programs.emplace_back(ImageId, *this, std::move(Image));
+ return Programs.back();
+ }
+
+ const auto &getLastProgram() const { return Programs.back(); }
+ auto &getLastProgram() { return Programs.back(); }
+ // Device properties getters
+ auto getVendorId() const { return DeviceProperties.vendorId; }
+ bool isGPU() const { return DeviceProperties.type == ZE_DEVICE_TYPE_GPU; }
+
+ auto getPCIId() const { return DeviceProperties.deviceId; }
+ auto getNumThreadsPerEU() const { return DeviceProperties.numThreadsPerEU; }
+ auto getSIMDWidth() const { return DeviceProperties.physicalEUSimdWidth; }
+ auto getNumEUsPerSubslice() const {
+ return DeviceProperties.numEUsPerSubslice;
+ }
+ auto getNumSubslicesPerSlice() const {
+ return DeviceProperties.numSubslicesPerSlice;
+ }
+ auto getNumSlices() const { return DeviceProperties.numSlices; }
+ auto getNumSubslices() const {
+ return DeviceProperties.numSubslicesPerSlice * DeviceProperties.numSlices;
+ }
+ uint32_t getNumEUs() const {
+ return DeviceProperties.numEUsPerSubslice * getNumSubslices();
+ }
+ auto getTotalThreads() const {
+ return DeviceProperties.numThreadsPerEU * getNumEUs();
+ }
+ auto getNumThreadsPerSubslice() const {
+ return getNumEUsPerSubslice() * getNumThreadsPerEU();
+ }
+ auto getClockRate() const { return DeviceProperties.coreClockRate; }
+
+ auto getMaxSharedLocalMemory() const {
+ return ComputeProperties.maxSharedLocalMemory;
+ }
+ auto getMaxGroupSize() const { return ComputeProperties.maxTotalGroupSize; }
+ auto getGlobalMemorySize() const { return MemoryProperties.totalSize; }
+ auto getCacheSize() const { return CacheProperties.cacheSize; }
+ auto getMaxMemAllocSize() const { return DeviceProperties.maxMemAllocSize; }
+
+ int32_t getAllocKind() const { return AllocKind; }
+ DeviceArchTy getDeviceArch() const { return DeviceArch; }
+ bool isDeviceArch(DeviceArchTy Arch) const { return DeviceArch == Arch; }
+
+ static bool isDiscrete(uint32_t PCIId) {
+ switch (static_cast<PCIIdTy>(PCIId & 0xFF00)) {
+ case PCIIdTy::BMG:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ static bool isDiscrete(ze_device_handle_t Device) {
+ ze_device_properties_t PR{};
+ PR.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+ PR.pNext = nullptr;
+ CALL_ZE_RET(false, zeDeviceGetProperties, Device, &PR);
+ return isDiscrete(PR.deviceId);
+ }
+
+  /// Returns whether isDiscrete(uint32_t) classifies this device's PCI ID as
+  /// discrete. Const-qualified because it only reads the ID through the const
+  /// getter getPCIId() and calls a static helper.
+  bool isDiscreteDevice() const { return isDiscrete(getPCIId()); }
+ bool isDeviceIPorNewer(uint32_t Version) const;
+
+ const std::string_view getUuid() const { return DeviceUuid; }
+
+ uint32_t getComputeEngine() const { return ComputeOrdinal.first; }
+ uint32_t getNumComputeQueues() const { return ComputeOrdinal.second; }
+
+ bool hasMainCopyEngine() const { return CopyOrdinal.first != UINT32_MAX; }
+ uint32_t getMainCopyEngine() const { return CopyOrdinal.first; }
+
+ uint32_t getLinkCopyEngine() const { return LinkCopyOrdinal.first; }
+ uint32_t getNumLinkCopyQueues() const { return LinkCopyOrdinal.second; }
+ bool hasLinkCopyEngine() const { return getNumLinkCopyQueues() > 0; }
+
+ bool deviceRequiresImmCmdList() const {
+ return isDeviceIPorNewer(0x05004000);
+ }
+ bool asyncEnabled() const { return IsAsyncEnabled; }
+ bool useImmForCompute() const { return true; }
+ bool useImmForCopy() const { return true; }
+ bool useImmForInterop() const { return true; }
+ bool forceInorderInterop() const { return true; }
+
+ void reportDeviceInfo() const;
+
+ // Command queues related functions
+ /// Create a command list with given ordinal and flags
+ ze_command_list_handle_t createCmdList(ze_context_handle_t Context,
+ ze_device_handle_t Device,
+ uint32_t Ordinal,
+ ze_command_list_flags_t Flags,
+ const std::string_view DeviceIdStr);
+
+ /// Create a command list with default flags
+ ze_command_list_handle_t createCmdList(ze_context_handle_t Context,
+ ze_device_handle_t Device,
+ uint32_t Ordinal,
+ const std::string_view DeviceIdStr);
+
+ ze_command_list_handle_t getCmdList();
+
+ /// Create a command queue with given ordinal and flags
+ ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context,
+ ze_device_handle_t Device,
+ uint32_t Ordinal, uint32_t Index,
+ ze_command_queue_flags_t Flags,
+ const std::string_view DeviceIdStr);
+
+ /// Create a command queue with default flags
+ ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context,
+ ze_device_handle_t Device,
+ uint32_t Ordinal, uint32_t Index,
+ const std::string_view DeviceIdStr,
+ bool InOrder = false);
+
+ /// Create a new command queue for the given OpenMP device ID
+ ze_command_queue_handle_t createCommandQueue(bool InOrder = false);
+
+ /// Create an immediate command list
+ ze_command_list_handle_t createImmCmdList(uint32_t Ordinal, uint32_t Index,
+ bool InOrder = false);
+
+ /// Create an immediate command list for computing
+ ze_command_list_handle_t createImmCmdList(bool InOrder = false) {
+ return createImmCmdList(getComputeEngine(), getComputeIndex(), InOrder);
+ }
+
+ /// Create an immediate command list for copying
+ ze_command_list_handle_t createImmCopyCmdList();
+ ze_command_queue_handle_t getCmdQueue();
+ ze_command_list_handle_t getCopyCmdList();
+ ze_command_queue_handle_t getCopyCmdQueue();
+ ze_command_list_handle_t getLinkCopyCmdList();
+ ze_command_queue_handle_t getLinkCopyCmdQueue();
+ ze_command_list_handle_t getImmCmdList();
+ ze_command_list_handle_t getImmCopyCmdList();
+
+ /// Enqueue copy command
+ int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size,
+ __tgt_async_info *AsyncInfo = nullptr,
+ bool Locked = false, bool UseCopyEngine = true);
----------------
adurang wrote:
I removed it
https://github.com/llvm/llvm-project/pull/158900
More information about the llvm-commits
mailing list