[llvm] [OFFLOAD] Add plugin with support for Intel oneAPI Level Zero (PR #158900)
Alex Duran via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 21 05:54:39 PDT 2025
================
@@ -0,0 +1,579 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Memory related support for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
+
+#include <cassert>
+#include <level_zero/ze_api.h>
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>
+
+#include "L0Defs.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+// Forward declarations.
+struct L0OptionsTy;
+class L0DeviceTy;
+class L0ContextTy;
+
+/// Map a TARGET_ALLOC_* kind to a human-readable name for trace/debug output.
+/// NOTE: as a macro this evaluates Kind more than once -- only pass
+/// side-effect-free expressions.
+#define ALLOC_KIND_TO_STR(Kind)                                                \
+  (Kind == TARGET_ALLOC_HOST                                                   \
+       ? "host memory"                                                         \
+       : (Kind == TARGET_ALLOC_SHARED                                          \
+              ? "shared memory"                                                \
+              : (Kind == TARGET_ALLOC_DEVICE ? "device memory"                 \
+                                             : "unknown memory")))
+
+/// Per-heap bookkeeping for dynamic memory allocation. Plain aggregate with
+/// zero-initializing defaults. BlockDesc and BlockCounter are raw pointers --
+/// NOTE(review): ownership/lifetime of these arrays is not visible here;
+/// confirm who allocates and frees them.
+struct DynamicMemHeapTy {
+ /// Base address memory is allocated from
+ uintptr_t AllocBase = 0;
+ /// Minimal size served by the current heap
+ size_t BlockSize = 0;
+ /// Max size served by the current heap
+ size_t MaxSize = 0;
+ /// Available memory blocks
+ uint32_t NumBlocks = 0;
+ /// Number of block descriptors
+ uint32_t NumBlockDesc = 0;
+ /// Number of block counters
+ uint32_t NumBlockCounter = 0;
+ /// List of memory block descriptors
+ uint64_t *BlockDesc = nullptr;
+ /// List of memory block counters
+ uint32_t *BlockCounter = nullptr;
+};
+
+/// Collection of dynamic-memory heaps served out of a single base allocation.
+/// NOTE(review): NumHeaps must not exceed 8, the capacity of the fixed-size
+/// HeapDesc array below -- consider a named constant for that bound.
+struct DynamicMemPoolTy {
+ /// Location of device memory blocks
+ void *PoolBase = nullptr;
+ /// Heap size common to all heaps
+ size_t HeapSize = 0;
+ /// Number of heaps available
+ uint32_t NumHeaps = 0;
+ /// Heap descriptors (using fixed-size array to simplify memory allocation)
+ DynamicMemHeapTy HeapDesc[8];
+};
+
+/// Memory allocation information used in memory allocation/deallocation.
+/// One record per outstanding allocation; kept in the plugin's bookkeeping
+/// map so dealloc can recover the base pointer and kind.
+struct MemAllocInfoTy {
+ /// Base address allocated from compute runtime
+ void *Base = nullptr;
+ /// Allocation size known to users/libomptarget
+ size_t Size = 0;
+ /// TARGET_ALLOC kind
+ int32_t Kind = TARGET_ALLOC_DEFAULT;
+ /// Allocation from pool?
+ bool InPool = false;
+ /// Is implicit argument
+ bool ImplicitArg = false;
+
+ MemAllocInfoTy() = default;
+
+ /// Member-wise constructor; parameters mirror the fields above.
+ MemAllocInfoTy(void *_Base, size_t _Size, int32_t _Kind, bool _InPool,
+ bool _ImplicitArg)
+ : Base(_Base), Size(_Size), Kind(_Kind), InPool(_InPool),
+ ImplicitArg(_ImplicitArg) {}
+};
+
+/// Responsible for all activities involving memory allocation/deallocation.
+/// It contains memory pool management, memory allocation bookkeeping.
+class MemAllocatorTy {
+
+ /// Simple memory allocation statistics. Maintains numbers for pool allocation
+ /// and GPU RT allocation.
+ /// NOTE(review): each field is a 2-element array, one slot per allocation
+ /// path (pool vs. GPU RT); the index convention is not spelled out in this
+ /// header -- confirm which index is which at the use sites.
+ struct MemStatTy {
+ size_t Requested[2] = {0, 0}; // Requested bytes
+ size_t Allocated[2] = {0, 0}; // Allocated bytes
+ size_t Freed[2] = {0, 0}; // Freed bytes
+ size_t InUse[2] = {0, 0}; // Current memory in use
+ size_t PeakUse[2] = {0, 0}; // Peak bytes used
+ size_t NumAllocs[2] = {0, 0}; // Number of allocations
+ MemStatTy() = default;
+ };
+
+  /// Memory pool which enables reuse of already allocated blocks
+  /// -- Pool maintains a list of buckets each of which can allocate fixed-size
+  ///    memory.
+  /// -- Each bucket maintains a list of memory blocks allocated by GPU RT.
+  /// -- Each memory block can allocate multiple fixed-size memory requested by
+  ///    offload RT or user.
+  /// -- Memory allocation falls back to GPU RT allocation when the pool size
+  ///    (total memory used by pool) reaches a threshold.
+  class MemPoolTy {
+
+    /// Memory block maintained in each bucket. A block is one GPU RT
+    /// allocation carved into NumSlots equal chunks of ChunkSize bytes.
+    struct BlockTy {
+      /// Base address of this block
+      uintptr_t Base = 0;
+      /// Size of the block
+      size_t Size = 0;
+      /// Supported allocation size by this block
+      size_t ChunkSize = 0;
+      /// Total number of slots
+      uint32_t NumSlots = 0;
+      /// Number of slots in use
+      uint32_t NumUsedSlots = 0;
+      /// Cached available slot returned by the last dealloc() call
+      uint32_t FreeSlot = UINT32_MAX;
+      /// Marker for the currently used slots (std::vector<bool> is a
+      /// packed-bit container: compact, slightly slower element access)
+      std::vector<bool> UsedSlots;
+
+      BlockTy(void *_Base, size_t _Size, size_t _ChunkSize) {
+        // Guard the division below; a zero chunk size is a caller bug.
+        assert(_ChunkSize > 0 && "Invalid chunk size");
+        Base = reinterpret_cast<uintptr_t>(_Base);
+        Size = _Size;
+        ChunkSize = _ChunkSize;
+        NumSlots = Size / ChunkSize;
+        NumUsedSlots = 0;
+        UsedSlots.resize(NumSlots, false);
+      }
+
+      /// Check if the current block is fully used
+      bool isFull() const { return NumUsedSlots == NumSlots; }
+
+      /// Check if the given address belongs to the current block
+      bool contains(void *Mem) const {
+        auto M = reinterpret_cast<uintptr_t>(Mem);
+        return M >= Base && M < Base + Size;
+      }
+
+      /// Allocate a single chunk from the block
+      void *alloc();
+
+      /// Deallocate the given memory
+      void dealloc(void *Mem);
+    }; // BlockTy
+
+    /// Allocation kind for the current pool
+    int32_t AllocKind = TARGET_ALLOC_DEFAULT;
+    /// Access to the allocator (non-owning back pointer)
+    MemAllocatorTy *Allocator = nullptr;
+    /// Minimum supported memory allocation size from pool
+    size_t AllocMin = 1 << 6; // 64B
+    /// Maximum supported memory allocation size from pool
+    size_t AllocMax = 0;
+    /// Allocation size when the pool needs to allocate a block
+    size_t AllocUnit = 1 << 16; // 64KB
+    /// Capacity of each block in the buckets which decides number of
+    /// allocatable chunks from the block. Each block in the bucket can serve
+    /// at least BlockCapacity chunks.
+    /// If ChunkSize * BlockCapacity <= AllocUnit
+    ///   BlockSize = AllocUnit
+    /// Otherwise,
+    ///   BlockSize = ChunkSize * BlockCapacity
+    /// This simply means how much memory is over-allocated.
+    uint32_t BlockCapacity = 0;
+    /// Total memory allocated from GPU RT for this pool
+    size_t PoolSize = 0;
+    /// Maximum allowed pool size. Allocation falls back to GPU RT allocation
+    /// when PoolSize reaches PoolSizeMax.
+    size_t PoolSizeMax = 0;
+    /// Small allocation size allowed in the pool even if pool size is over the
+    /// pool size limit
+    size_t SmallAllocMax = 1024;
+    /// Small allocation pool size
+    size_t SmallPoolSize = 0;
+    /// Small allocation pool size max (4MB)
+    size_t SmallPoolSizeMax = (4 << 20);
+    /// List of buckets; bucket I serves chunks of size AllocMin << I
+    std::vector<std::vector<BlockTy *>> Buckets;
+    /// List of bucket parameters
+    std::vector<std::pair<size_t, size_t>> BucketParams;
+    /// Map from allocated pointer to corresponding block.
+    llvm::DenseMap<void *, BlockTy *> PtrToBlock;
+    /// Simple stats counting miss/hit in each bucket.
+    std::vector<std::pair<uint64_t, uint64_t>> BucketStats;
+    /// Need to zero-initialize after L0 allocation
+    bool ZeroInit = false;
+
+    /// Get bucket ID from the specified allocation size: the number of
+    /// doublings of AllocMin needed to reach Size (0 for Size <= AllocMin).
+    uint32_t getBucketId(size_t Size) const {
+      uint32_t Count = 0;
+      for (size_t SZ = AllocMin; SZ < Size; Count++)
+        SZ <<= 1;
+      return Count;
+    }
+
+  public:
+    MemPoolTy() = default;
+
+    /// Construct pool with allocation kind, allocator, and user options.
+    MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
+              const L0OptionsTy &Option);
+    // Used for reduction pool
+    MemPoolTy(MemAllocatorTy *_Allocator, const L0OptionsTy &Option);
+    // Used for small memory pool with fixed parameters
+    MemPoolTy(MemAllocatorTy *_Allocator);
+
+    // Non-copyable, non-movable: the pool owns raw block pointers and is
+    // referenced by address from the allocator.
+    MemPoolTy(const MemPoolTy &) = delete;
+    MemPoolTy(MemPoolTy &&) = delete;
+    MemPoolTy &operator=(const MemPoolTy &) = delete;
+    MemPoolTy &operator=(MemPoolTy &&) = delete;
+
+    /// Print hit/miss and usage statistics for this pool.
+    void printUsage();
+    /// Release resources used in the pool.
+    ~MemPoolTy();
+
+    /// Allocate the requested size of memory from this pool.
+    /// AllocSize is the chunk size internally used for the returned memory.
+    void *alloc(size_t Size, size_t &AllocSize);
+    /// Deallocate the specified memory and returns block size deallocated.
+    size_t dealloc(void *Ptr);
+  }; // MemPoolTy
+
+ /// Allocation information maintained in the plugin
+ class MemAllocInfoMapTy {
+ /// Map from allocated pointer to allocation information
+ std::map<void *, MemAllocInfoTy> Map;
----------------
adurang wrote:
Yes, we need the pointers to be kept in sorted order so that we can properly check whether a pointer falls within an existing allocation's address range.
https://github.com/llvm/llvm-project/pull/158900
More information about the llvm-commits
mailing list