[llvm] [OFFLOAD] Add plugin with support for Intel oneAPI Level Zero (PR #158900)
Piotr Balcer via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 7 06:11:10 PDT 2025
================
@@ -0,0 +1,645 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Memory related support for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Memory.h"
+#include "L0Device.h"
+#include "L0Plugin.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Return a free chunk from this block, or nullptr if the block is full.
+void *MemAllocatorTy::MemPoolTy::BlockTy::alloc() {
+  if (isFull())
+    return nullptr;
+  // Fast path: reuse the single slot cached by the most recent dealloc().
+  if (FreeSlot != UINT32_MAX) {
+    const uint32_t Slot = FreeSlot;
+    FreeSlot = UINT32_MAX;
+    UsedSlots[Slot] = true;
+    NumUsedSlots++;
+    return reinterpret_cast<void *>(Base + Slot * ChunkSize);
+  }
+  // Slow path: linear scan for the first unused slot.
+  for (uint32_t I = 0; I < NumSlots; I++) {
+    if (UsedSlots[I])
+      continue;
+    UsedSlots[I] = true;
+    NumUsedSlots++;
+    return reinterpret_cast<void *>(Base + I * ChunkSize);
+  }
+  // Should not reach here: isFull() returned false above, so the scan
+  // must have found a free slot.
+  assert(0 && "Inconsistent memory pool state");
+  return nullptr;
+}
+
+/// Deallocate the given memory, returning its chunk slot to this block.
+/// \p Mem must be a pointer previously returned by alloc() on this block.
+void MemAllocatorTy::MemPoolTy::BlockTy::dealloc(void *Mem) {
+  // Assert directly on the invariant: the original `if (!contains(Mem))
+  // assert(0)` form still evaluated contains() in NDEBUG builds and then
+  // silently fell through; a plain assert compiles out entirely.
+  assert(contains(Mem) && "Inconsistent memory pool state");
+  const uint32_t Slot = (reinterpret_cast<uintptr_t>(Mem) - Base) / ChunkSize;
+  UsedSlots[Slot] = false;
+  NumUsedSlots--;
+  // Cache the freed slot so the next alloc() can take the fast path.
+  FreeSlot = Slot;
+}
+
+/// Construct a memory pool for the given allocation kind, sized from the
+/// user-configurable options (max single allocation, block capacity, and
+/// overall pool size), probing the L0 runtime for the device page size.
+MemAllocatorTy::MemPoolTy::MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
+                                     const L0OptionsTy &Option) {
+  AllocKind = Kind;
+  Allocator = _Allocator;
+
+  // Read user-defined options
+  // Layout assumed: [0]=AllocMax(MB), [1]=Capacity, [2]=PoolSize(MB) --
+  // matches the DP() summary at the end of this constructor.
+  const auto &UserOptions = Option.MemPoolInfo.at(AllocKind);
+  const size_t UserAllocMax = UserOptions[0];
+  const size_t UserCapacity = UserOptions[1];
+  const size_t UserPoolSize = UserOptions[2];
+
+  BlockCapacity = UserCapacity;
+  PoolSizeMax = UserPoolSize << 20; // MB to B
+  PoolSize = 0;
+
+  auto Context = Allocator->L0Context->getZeContext();
+  const auto Device = Allocator->Device;
+
+  // Check page size used for this allocation kind to decide minimum
+  // allocation size when allocating from L0.
+  // An 8-byte probe allocation is made and freed solely to query pageSize.
+  void *Mem = Allocator->allocL0(8, 0, AllocKind);
+  ze_memory_allocation_properties_t AP{
+      ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, nullptr,
+      ZE_MEMORY_TYPE_UNKNOWN, 0, 0};
+  CALL_ZE_RET_VOID(zeMemGetAllocProperties, Context, Mem, &AP, nullptr);
+  // Parenthesized (std::max) avoids clashing with a max() macro.
+  AllocUnit = (std::max)(AP.pageSize, AllocUnit);
+  CALL_ZE_RET_VOID(zeMemFree, Context, Mem);
+
+  bool IsDiscrete = false;
+  if (Device) {
+    ze_device_properties_t Properties{};
+    Properties.deviceId = 0;
+    Properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    Properties.pNext = nullptr;
+    CALL_ZE_RET_VOID(zeDeviceGetProperties, Device->getZeDevice(), &Properties);
+    IsDiscrete = Device->isDiscreteDevice();
+
+    if (AllocKind == TARGET_ALLOC_SHARED && IsDiscrete) {
+      // Use page size as minimum chunk size for USM shared on discrete
+      // device.
+      // FIXME: pageSize is not returned correctly (=0) on some new devices,
+      // so use fallback value for now.
+      AllocMin = (std::max)(AP.pageSize, AllocUnit);
+      AllocUnit = AllocMin * BlockCapacity;
+    }
+  }
+
+  // Convert MB to B and round up to power of 2
+  AllocMax = AllocMin << getBucketId(UserAllocMax * (1 << 20));
+  // Guarantee at least two bucket sizes so AllocMin < AllocMax holds.
+  if (AllocMin >= AllocMax) {
+    AllocMax = 2 * AllocMin;
+    DP("Warning: Adjusting pool's AllocMax to %zu for %s due to device "
+       "requirements.\n",
+       AllocMax, ALLOC_KIND_TO_STR(AllocKind));
+  }
+  assert(AllocMin < AllocMax &&
+         "Invalid parameters while initializing memory pool");
+  // One bucket per power-of-two chunk size in [AllocMin, AllocMax].
+  const auto MinSize = getBucketId(AllocMin);
+  const auto MaxSize = getBucketId(AllocMax);
+  Buckets.resize(MaxSize - MinSize + 1);
+  BucketStats.resize(Buckets.size(), {0, 0});
+
+  // Set bucket parameters
+  for (size_t I = 0; I < Buckets.size(); I++) {
+    const size_t ChunkSize = AllocMin << I;
+    size_t BlockSize = ChunkSize * BlockCapacity;
+    // On discrete device, the cost of native L0 invocation doubles when the
+    // the requested size doubles after certain threshold, so allocating
+    // larger block does not pay off at all. It is better to keep a single
+    // chunk in a single block in such cases.
+    if (BlockSize <= AllocUnit) {
+      BlockSize = AllocUnit; // Allocation unit is already large enough
+    } else if (IsDiscrete) {
+      // Do not preallocate if it does not pay off
+      if (ChunkSize >= L0UsmPreAllocThreshold ||
+          (AllocKind == TARGET_ALLOC_HOST &&
+           ChunkSize >= L0HostUsmPreAllocThreshold))
+        BlockSize = ChunkSize;
+    }
+    BucketParams.emplace_back(ChunkSize, BlockSize);
+  }
+
+  DP("Initialized %s pool for device " DPxMOD ": AllocUnit = %zu, "
+     "AllocMax = %zu, "
+     "Capacity = %" PRIu32 ", PoolSizeMax = %zu\n",
+     ALLOC_KIND_TO_STR(AllocKind), DPxPTR(Device), AllocUnit, AllocMax,
+     BlockCapacity, PoolSizeMax);
+}
+
+// Used for reduction pool
+/// Construct a device-memory pool dedicated to reduction scratch space.
+/// Parameters come from ReductionPoolInfo; layout assumed to be
+/// [0]=AllocMax(MB), [1]=Capacity, [2]=PoolSizeMax(MB) -- matching the
+/// MB-to-B shifts below (verify against L0OptionsTy).
+MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator,
+                                     const L0OptionsTy &Option) {
+  AllocKind = TARGET_ALLOC_DEVICE;
+  Allocator = _Allocator;
+  AllocMin = AllocUnit = 1024 << 6; // 64KB
+  AllocMax = Option.ReductionPoolInfo[0] << 20;
+  BlockCapacity = Option.ReductionPoolInfo[1];
+  PoolSize = 0;
+  // Cast before the shift to avoid overflowing a narrower option type.
+  PoolSizeMax = (size_t)Option.ReductionPoolInfo[2] << 20;
+
+  // One bucket per power-of-two chunk size in [AllocMin, AllocMax].
+  const auto MinSize = getBucketId(AllocMin);
+  const auto MaxSize = getBucketId(AllocMax);
+  Buckets.resize(MaxSize - MinSize + 1);
+  BucketStats.resize(Buckets.size(), {0, 0});
+  for (size_t I = 0; I < Buckets.size(); I++) {
+    const size_t ChunkSize = AllocMin << I;
+    BucketParams.emplace_back(ChunkSize, ChunkSize * BlockCapacity);
+  }
+
+  DP("Initialized reduction scratch pool for device " DPxMOD
+     ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n",
+     DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax);
+}
+
+// Used for small memory pool with fixed parameters
+/// Construct a single-bucket, zero-initialized device pool (used for
+/// reduction counters). Relies on the in-class default values of AllocMin
+/// and AllocUnit (declared in L0Memory.h, not visible here).
+MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator) {
+  AllocKind = TARGET_ALLOC_DEVICE;
+  Allocator = _Allocator;
+  // Single chunk size: every allocation is AllocMin bytes.
+  AllocMax = AllocMin;
+  BlockCapacity = AllocUnit / AllocMax;
+  PoolSize = 0;
+  PoolSizeMax = (1 << 20); // this should be sufficiently large
+  Buckets.resize(1);
+  BucketStats.resize(1, {0, 0});
+  BucketParams.emplace_back(AllocMax, AllocUnit);
+  // New blocks are memset to zero via this staging buffer on allocation.
+  ZeroInit = true;
+  ZeroInitValue.resize(AllocUnit, 0);
+  DP("Initialized zero-initialized reduction counter pool for "
+     "device " DPxMOD ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n",
+     DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax);
+}
+
+/// Print per-bucket allocation statistics (new allocations vs. pool reuses)
+/// to stderr. Called from the destructor when debugging is enabled.
+void MemAllocatorTy::MemPoolTy::printUsage() {
+  // Right-aligned numeric column; switches to scientific form for values
+  // that would overflow the 11-character width.
+  auto PrintNum = [](uint64_t Num) {
+    if (Num > 1e9)
+      fprintf(stderr, "%11.2e", float(Num));
+    else
+      fprintf(stderr, "%11" PRIu64, Num);
+  };
+
+  // Skip the detailed table entirely if no bucket was ever used.
+  bool HasPoolAlloc = false;
+  for (auto &Stat : BucketStats) {
+    if (Stat.first > 0 || Stat.second > 0) {
+      HasPoolAlloc = true;
+      break;
+    }
+  }
+
+  DP("MemPool usage for %s, device " DPxMOD "\n", ALLOC_KIND_TO_STR(AllocKind),
+     DPxPTR(Allocator->Device));
+
+  if (HasPoolAlloc) {
+    DP("-- AllocMax=%zu(MB), Capacity=%" PRIu32 ", PoolSizeMax=%zu(MB)\n",
+       AllocMax >> 20, BlockCapacity, PoolSizeMax >> 20);
+    DP("-- %18s:%11s%11s%11s\n", "", "NewAlloc", "Reuse", "Hit(%)");
+    for (size_t I = 0; I < Buckets.size(); I++) {
+      // Stat.first = new L0 allocations, Stat.second = pool reuses.
+      const auto &Stat = BucketStats[I];
+      if (Stat.first > 0 || Stat.second > 0) {
+        DP("-- Bucket[%10zu]:", BucketParams[I].first);
+        PrintNum(Stat.first);
+        PrintNum(Stat.second);
+        // Hit rate: reuses as a percentage of all requests in this bucket.
+        fprintf(stderr, "%11.2f\n",
+                float(Stat.second) / float(Stat.first + Stat.second) * 100);
+      }
+    }
+  } else {
+    DP("-- Not used\n");
+  }
+}
+
+/// Release resources used in the pool.
+/// Frees every block's L0 allocation and deletes the block bookkeeping;
+/// reports usage statistics first when debugging is enabled.
+MemAllocatorTy::MemPoolTy::~MemPoolTy() {
+  const int DebugLevel = getDebugLevel();
+  if (DebugLevel > 0)
+    printUsage();
+  for (auto &Bucket : Buckets) {
+    for (auto *Block : Bucket) {
+      if (DebugLevel > 0)
+        Allocator->log(0, Block->Size, AllocKind);
+      // NOTE: CALL_ZE_RET_VOID returns early on L0 failure, which would
+      // leak the remaining BlockTy objects in that case.
+      CALL_ZE_RET_VOID(zeMemFree, Allocator->L0Context->getZeContext(),
+                       reinterpret_cast<void *>(Block->Base));
+      delete Block;
+    }
+  }
+}
+
+/// Allocate the requested size of memory from this pool.
+/// AllocSize is the chunk size internally used for the returned memory.
+void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) {
+ if (Size == 0 || Size > AllocMax)
+ return nullptr;
+
+ const uint32_t BucketId = getBucketId(Size);
+ auto &Blocks = Buckets[BucketId];
+ void *Mem = nullptr;
+
+ for (auto *Block : Blocks) {
+ if (Block->isFull())
+ continue;
+ Mem = Block->alloc();
+ assert(Mem && "Inconsistent state while allocating memory from pool");
+ PtrToBlock.try_emplace(Mem, Block);
+ break;
+ }
+
+ if (Mem == nullptr) {
+ const bool IsSmallAllocatable =
+ (Size <= SmallAllocMax && SmallPoolSize <= SmallPoolSizeMax);
+ const bool IsFull = (PoolSize > PoolSizeMax);
+ if (IsFull && !IsSmallAllocatable)
+ return nullptr;
+ // Bucket is empty or all blocks in the bucket are full
+ const auto ChunkSize = BucketParams[BucketId].first;
+ const auto BlockSize = BucketParams[BucketId].second;
+ void *Base = Allocator->allocL0(BlockSize, 0, AllocKind);
+
+ if (ZeroInit) {
+ auto RC =
+ Allocator->enqueueMemCopy(Base, ZeroInitValue.data(), BlockSize);
----------------
pbalcer wrote:
nit: you could use https://oneapi-src.github.io/level-zero-spec/level-zero/1.11/core/api.html#zecommandlistappendmemoryfill
https://github.com/llvm/llvm-project/pull/158900
More information about the llvm-commits
mailing list