[llvm] [Offload][NFC] Reorganize `utils::` and make Device/Host/Shared clearer (PR #100280)
Johannes Doerfert via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 23 16:39:10 PDT 2024
https://github.com/jdoerfert created https://github.com/llvm/llvm-project/pull/100280
We had three `utils::` namespaces, all with different "meaning" (host, device, hsa_utils). We should, when we can, keep "include/Shared" accessible from host and device, thus RefCountTy has been moved to a separate header. `hsa_utils` was introduced to make `utils::` less overloaded. And common functionality was de-duplicated, e.g., `utils::advance` and `utils::advanceVoidPtr` -> `utils::advancePtr`. Type punning now checks for the size of the result to make sure it matches the source type.
No functional change was intended.
>From 1733bcd1b541808cfad2ffd7f569ec997033a6e8 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Tue, 23 Jul 2024 15:59:22 -0700
Subject: [PATCH] [Offload][NFC] Reorganize `utils::` and make
Device/Host/Shared clearer
We had three `utils::` namespaces, all with different "meaning" (host,
device, hsa_utils). We should, when we can, keep "include/Shared"
accessible from host and device, thus RefCountTy has been moved to a
separate header. `hsa_utils` was introduced to make `utils::` less
overloaded. And common functionality was de-duplicated, e.g.,
`utils::advance` and `utils::advanceVoidPtr` -> `utils::advancePtr`.
Type punning now checks for the size of the result to make sure it
matches the source type.
No functional change was intended.
---
offload/DeviceRTL/CMakeLists.txt | 6 +-
offload/DeviceRTL/include/Allocator.h | 2 +-
offload/DeviceRTL/include/Configuration.h | 2 +-
.../include/{Types.h => DeviceTypes.h} | 2 +-
offload/DeviceRTL/include/DeviceUtils.h | 54 ++++++++++
offload/DeviceRTL/include/Interface.h | 2 +-
offload/DeviceRTL/include/LibC.h | 2 +-
offload/DeviceRTL/include/Mapping.h | 2 +-
offload/DeviceRTL/include/State.h | 4 +-
offload/DeviceRTL/include/Synchronization.h | 2 +-
offload/DeviceRTL/include/Utils.h | 100 ------------------
offload/DeviceRTL/src/Allocator.cpp | 4 +-
offload/DeviceRTL/src/Configuration.cpp | 2 +-
offload/DeviceRTL/src/Debug.cpp | 2 +-
.../src/{Utils.cpp => DeviceUtils.cpp} | 16 +--
offload/DeviceRTL/src/Kernel.cpp | 2 +-
offload/DeviceRTL/src/Mapping.cpp | 4 +-
offload/DeviceRTL/src/Misc.cpp | 2 +-
offload/DeviceRTL/src/Parallelism.cpp | 4 +-
offload/DeviceRTL/src/Reduction.cpp | 4 +-
offload/DeviceRTL/src/State.cpp | 16 +--
offload/DeviceRTL/src/Synchronization.cpp | 4 +-
offload/DeviceRTL/src/Tasking.cpp | 6 +-
offload/DeviceRTL/src/Workshare.cpp | 6 +-
offload/include/Shared/RefCnt.h | 56 ++++++++++
offload/include/Shared/Types.h | 22 ++++
offload/include/Shared/Utils.h | 100 ++++++++----------
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 100 +++++++++---------
.../amdgpu/utils/UtilitiesRTL.h | 4 +-
.../common/include/PluginInterface.h | 16 +--
.../common/src/GlobalHandler.cpp | 4 +-
offload/plugins-nextgen/common/src/JIT.cpp | 4 +-
.../common/src/PluginInterface.cpp | 26 ++---
offload/plugins-nextgen/cuda/src/rtl.cpp | 2 +-
offload/src/DeviceImage.cpp | 5 +-
offload/src/omptarget.cpp | 4 +-
36 files changed, 310 insertions(+), 283 deletions(-)
rename offload/DeviceRTL/include/{Types.h => DeviceTypes.h} (98%)
create mode 100644 offload/DeviceRTL/include/DeviceUtils.h
delete mode 100644 offload/DeviceRTL/include/Utils.h
rename offload/DeviceRTL/src/{Utils.cpp => DeviceUtils.cpp} (90%)
create mode 100644 offload/include/Shared/RefCnt.h
create mode 100644 offload/include/Shared/Types.h
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 7818c8d752599..5bd54c32a37bd 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -79,8 +79,8 @@ set(include_files
${include_directory}/Mapping.h
${include_directory}/State.h
${include_directory}/Synchronization.h
- ${include_directory}/Types.h
- ${include_directory}/Utils.h
+ ${include_directory}/DeviceTypes.h
+ ${include_directory}/DeviceUtils.h
${include_directory}/Workshare.h
)
@@ -97,7 +97,7 @@ set(src_files
${source_directory}/State.cpp
${source_directory}/Synchronization.cpp
${source_directory}/Tasking.cpp
- ${source_directory}/Utils.cpp
+ ${source_directory}/DeviceUtils.cpp
${source_directory}/Workshare.cpp
)
diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h
index a28eb0fb2977e..6bb1cafac720f 100644
--- a/offload/DeviceRTL/include/Allocator.h
+++ b/offload/DeviceRTL/include/Allocator.h
@@ -12,7 +12,7 @@
#ifndef OMPTARGET_ALLOCATOR_H
#define OMPTARGET_ALLOCATOR_H
-#include "Types.h"
+#include "DeviceTypes.h"
// Forward declaration.
struct KernelEnvironmentTy;
diff --git a/offload/DeviceRTL/include/Configuration.h b/offload/DeviceRTL/include/Configuration.h
index 8e6f5c89cbf24..f8b7a6c3c6c9d 100644
--- a/offload/DeviceRTL/include/Configuration.h
+++ b/offload/DeviceRTL/include/Configuration.h
@@ -15,7 +15,7 @@
#include "Shared/Environment.h"
-#include "Types.h"
+#include "DeviceTypes.h"
namespace ompx {
namespace config {
diff --git a/offload/DeviceRTL/include/Types.h b/offload/DeviceRTL/include/DeviceTypes.h
similarity index 98%
rename from offload/DeviceRTL/include/Types.h
rename to offload/DeviceRTL/include/DeviceTypes.h
index 2e12d9da0353b..2a594de0befc2 100644
--- a/offload/DeviceRTL/include/Types.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -1,4 +1,4 @@
-//===---------- Types.h - OpenMP types ---------------------------- C++ -*-===//
+//===---------- DeviceTypes.h - OpenMP types ---------------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h
new file mode 100644
index 0000000000000..378d1fb2d65e3
--- /dev/null
+++ b/offload/DeviceRTL/include/DeviceUtils.h
@@ -0,0 +1,54 @@
+//===--- DeviceUtils.h - OpenMP device runtime utility functions -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_DEVICERTL_DEVICE_UTILS_H
+#define OMPTARGET_DEVICERTL_DEVICE_UTILS_H
+
+#include "Shared/Utils.h"
+#include "DeviceTypes.h"
+
+#pragma omp begin declare target device_type(nohost)
+
+namespace utils {
+
+/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread
+/// is identified by \p Mask.
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
+
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width);
+
+int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width);
+
+uint64_t ballotSync(uint64_t Mask, int32_t Pred);
+
+/// Return \p LowBits and \p HighBits packed into a single 64 bit value.
+uint64_t pack(uint32_t LowBits, uint32_t HighBits);
+
+/// Unpack \p Val into \p LowBits and \p HighBits.
+void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits);
+
+/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)).
+bool isSharedMemPtr(void *Ptr);
+
+/// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)).
+bool isThreadLocalMemPtr(void *Ptr);
+
+/// A pointer variable that has by design an `undef` value. Use with care.
+[[clang::loader_uninitialized]] static void *const UndefPtr;
+
+#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
+#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
+
+} // namespace utils
+
+#pragma omp end declare target
+
+#endif
diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h
index d36d4227091ef..c4bfaaa2404b4 100644
--- a/offload/DeviceRTL/include/Interface.h
+++ b/offload/DeviceRTL/include/Interface.h
@@ -14,7 +14,7 @@
#include "Shared/Environment.h"
-#include "Types.h"
+#include "DeviceTypes.h"
/// External API
///
diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h
index dde86af783af9..6e02b4aca462a 100644
--- a/offload/DeviceRTL/include/LibC.h
+++ b/offload/DeviceRTL/include/LibC.h
@@ -12,7 +12,7 @@
#ifndef OMPTARGET_LIBC_H
#define OMPTARGET_LIBC_H
-#include "Types.h"
+#include "DeviceTypes.h"
extern "C" {
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index 165904644dbb9..2fb87abe5418c 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -12,7 +12,7 @@
#ifndef OMPTARGET_MAPPING_H
#define OMPTARGET_MAPPING_H
-#include "Types.h"
+#include "DeviceTypes.h"
namespace ompx {
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
index 1a3490394458f..37699529e726f 100644
--- a/offload/DeviceRTL/include/State.h
+++ b/offload/DeviceRTL/include/State.h
@@ -16,8 +16,8 @@
#include "Debug.h"
#include "Mapping.h"
-#include "Types.h"
-#include "Utils.h"
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
// Forward declaration.
struct KernelEnvironmentTy;
diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h
index af9e1a673e6a2..874974cc861df 100644
--- a/offload/DeviceRTL/include/Synchronization.h
+++ b/offload/DeviceRTL/include/Synchronization.h
@@ -12,7 +12,7 @@
#ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
-#include "Types.h"
+#include "DeviceTypes.h"
namespace ompx {
diff --git a/offload/DeviceRTL/include/Utils.h b/offload/DeviceRTL/include/Utils.h
deleted file mode 100644
index 82e2397b5958b..0000000000000
--- a/offload/DeviceRTL/include/Utils.h
+++ /dev/null
@@ -1,100 +0,0 @@
-//===--------- Utils.h - OpenMP device runtime utility functions -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_UTILS_H
-#define OMPTARGET_DEVICERTL_UTILS_H
-
-#include "Types.h"
-
-#pragma omp begin declare target device_type(nohost)
-
-namespace ompx {
-namespace utils {
-
-/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread
-/// is identified by \p Mask.
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width);
-
-int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width);
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred);
-
-/// Return \p LowBits and \p HighBits packed into a single 64 bit value.
-uint64_t pack(uint32_t LowBits, uint32_t HighBits);
-
-/// Unpack \p Val into \p LowBits and \p HighBits.
-void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits);
-
-/// Round up \p V to a \p Boundary.
-template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
- return (V + Boundary - 1) / Boundary * Boundary;
-}
-
-/// Advance \p Ptr by \p Bytes bytes.
-template <typename Ty1, typename Ty2> inline Ty1 *advance(Ty1 Ptr, Ty2 Bytes) {
- return reinterpret_cast<Ty1 *>(reinterpret_cast<char *>(Ptr) + Bytes);
-}
-
-/// Return the first bit set in \p V.
-inline uint32_t ffs(uint32_t V) {
- static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
- return __builtin_ffs(V);
-}
-
-/// Return the first bit set in \p V.
-inline uint32_t ffs(uint64_t V) {
- static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
- return __builtin_ffsl(V);
-}
-
-/// Return the number of bits set in \p V.
-inline uint32_t popc(uint32_t V) {
- static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
- return __builtin_popcount(V);
-}
-
-/// Return the number of bits set in \p V.
-inline uint32_t popc(uint64_t V) {
- static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
- return __builtin_popcountl(V);
-}
-
-/// Return \p V aligned "upwards" according to \p Align.
-template <typename Ty1, typename Ty2> inline Ty1 align_up(Ty1 V, Ty2 Align) {
- return ((V + Ty1(Align) - 1) / Ty1(Align)) * Ty1(Align);
-}
-/// Return \p V aligned "downwards" according to \p Align.
-template <typename Ty1, typename Ty2> inline Ty1 align_down(Ty1 V, Ty2 Align) {
- return V - V % Align;
-}
-
-/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)).
-bool isSharedMemPtr(void *Ptr);
-
-/// Return \p V typed punned as \p DstTy.
-template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
- return *((DstTy *)(&V));
-}
-
-/// A pointer variable that has by design an `undef` value. Use with care.
-[[clang::loader_uninitialized]] static void *const UndefPtr;
-
-#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
-#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
-
-} // namespace utils
-} // namespace ompx
-
-#pragma omp end declare target
-
-#endif
diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp
index c9c940de62c1a..2a85a34d32f6e 100644
--- a/offload/DeviceRTL/src/Allocator.cpp
+++ b/offload/DeviceRTL/src/Allocator.cpp
@@ -14,8 +14,8 @@
#include "Configuration.h"
#include "Mapping.h"
#include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
using namespace ompx;
diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp
index ef0c3663536f5..4d97ad67313aa 100644
--- a/offload/DeviceRTL/src/Configuration.cpp
+++ b/offload/DeviceRTL/src/Configuration.cpp
@@ -13,7 +13,7 @@
#include "Configuration.h"
#include "State.h"
-#include "Types.h"
+#include "DeviceTypes.h"
using namespace ompx;
diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp
index 4e16591cc6c51..af26a26c13109 100644
--- a/offload/DeviceRTL/src/Debug.cpp
+++ b/offload/DeviceRTL/src/Debug.cpp
@@ -17,7 +17,7 @@
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
-#include "Types.h"
+#include "DeviceTypes.h"
using namespace ompx;
diff --git a/offload/DeviceRTL/src/Utils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp
similarity index 90%
rename from offload/DeviceRTL/src/Utils.cpp
rename to offload/DeviceRTL/src/DeviceUtils.cpp
index 53cc803234867..c204a7be73b1f 100644
--- a/offload/DeviceRTL/src/Utils.cpp
+++ b/offload/DeviceRTL/src/DeviceUtils.cpp
@@ -9,7 +9,7 @@
//
//===----------------------------------------------------------------------===//
-#include "Utils.h"
+#include "DeviceUtils.h"
#include "Debug.h"
#include "Interface.h"
@@ -33,7 +33,7 @@ uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
}
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
int32_t Width);
@@ -44,8 +44,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred);
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
- int Width = mapping::getWarpSize();
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
int Self = mapping::getThreadIdInWarp();
int Index = SrcLane + (Self & ~(Width - 1));
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
@@ -77,8 +76,8 @@ bool isSharedMemPtr(const void *Ptr) {
device = {arch(nvptx, nvptx64)}, \
implementation = {extension(match_any)})
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
- return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
+ return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
}
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
@@ -104,8 +103,9 @@ void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
impl::Unpack(Val, &LowBits, &HighBits);
}
-int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
- return impl::shuffle(Mask, Var, SrcLane);
+int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
+ int32_t Width) {
+ return impl::shuffle(Mask, Var, SrcLane, Width);
}
int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index e70704f25e922..c3b4554574786 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -18,7 +18,7 @@
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
-#include "Types.h"
+#include "DeviceTypes.h"
#include "Workshare.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index c1ce878746a69..8287312c74e4e 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -12,8 +12,8 @@
#include "Mapping.h"
#include "Interface.h"
#include "State.h"
-#include "Types.h"
-#include "Utils.h"
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
#pragma omp begin declare target device_type(nohost)
diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp
index c24af9442d16e..ca8b549b28dbf 100644
--- a/offload/DeviceRTL/src/Misc.cpp
+++ b/offload/DeviceRTL/src/Misc.cpp
@@ -10,7 +10,7 @@
//===----------------------------------------------------------------------===//
#include "Configuration.h"
-#include "Types.h"
+#include "DeviceTypes.h"
#include "Debug.h"
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
index 15b991f202539..e3fcfef9b22aa 100644
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ b/offload/DeviceRTL/src/Parallelism.cpp
@@ -37,8 +37,8 @@
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
using namespace ompx;
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 744d1a3a231c8..f4e2e0d25bde9 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -15,8 +15,8 @@
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
using namespace ompx;
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index a1e4fa2449d9a..3ba4b5c65c43a 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -13,13 +13,13 @@
#include "Allocator.h"
#include "Configuration.h"
#include "Debug.h"
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
#include "Interface.h"
#include "LibC.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
using namespace ompx;
@@ -84,14 +84,14 @@ struct SharedMemorySmartStackTy {
/// Deallocate the last allocation made by the encountering thread and pointed
/// to by \p Ptr from the stack. Each thread can call this function.
- void pop(void *Ptr, uint32_t Bytes);
+ void pop(void *Ptr, uint64_t Bytes);
private:
/// Compute the size of the storage space reserved for a thread.
uint32_t computeThreadStorageTotal() {
uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
- return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
- allocator::ALIGNMENT);
+ return utils::alignDown((state::SharedScratchpadSize / NumLanesInBlock),
+ allocator::ALIGNMENT);
}
/// Return the top address of the warp data stack, that is the first address
@@ -121,7 +121,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
// First align the number of requested bytes.
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
/// be passed in as an argument and the stack rewritten to support it.
- uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
+ uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT);
uint32_t StorageTotal = computeThreadStorageTotal();
@@ -148,8 +148,8 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
return GlobalMemory;
}
-void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
- uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
+void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
+ uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT);
if (utils::isSharedMemPtr(Ptr)) {
int TId = mapping::getThreadIdInBlock();
Usage[TId] -= AlignedBytes;
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
index 80ba87b300bcd..97a6b080169ad 100644
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ b/offload/DeviceRTL/src/Synchronization.cpp
@@ -16,8 +16,8 @@
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
-#include "Types.h"
-#include "Utils.h"
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
#pragma omp begin declare target device_type(nohost)
diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp
index 2dc33562e6d79..23a967c1a337e 100644
--- a/offload/DeviceRTL/src/Tasking.cpp
+++ b/offload/DeviceRTL/src/Tasking.cpp
@@ -13,10 +13,10 @@
//
//===----------------------------------------------------------------------===//
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
#include "Interface.h"
#include "State.h"
-#include "Types.h"
-#include "Utils.h"
using namespace ompx;
@@ -34,7 +34,7 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal(
TaskSizeTotal, "explicit task descriptor");
TaskDescriptor->Payload =
- utils::advance(TaskDescriptor, TaskSizeInclPrivateValuesPadded);
+ utils::advancePtr(TaskDescriptor, TaskSizeInclPrivateValuesPadded);
TaskDescriptor->TaskFn = TaskFn;
return TaskDescriptor;
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index 7e087a07e4420..ad60e66548be9 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -14,12 +14,12 @@
#include "Workshare.h"
#include "Debug.h"
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
using namespace ompx;
@@ -349,7 +349,7 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
if (rank == 0) {
warp_res = atomic::add(&Cnt, change, atomic::seq_cst);
}
- warp_res = utils::shuffle(active, warp_res, leader);
+ warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize());
return warp_res + rank;
}
diff --git a/offload/include/Shared/RefCnt.h b/offload/include/Shared/RefCnt.h
new file mode 100644
index 0000000000000..7c615ba167a3d
--- /dev/null
+++ b/offload/include/Shared/RefCnt.h
@@ -0,0 +1,56 @@
+//===-- Shared/RefCnt.h - Helper to keep track of references ------ C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_SHARED_REF_CNT_H
+#define OMPTARGET_SHARED_REF_CNT_H
+
+#include <atomic>
+#include <cassert>
+#include <limits>
+#include <memory>
+
+namespace llvm {
+namespace omp {
+namespace target {
+
+/// Utility class for thread-safe reference counting. Any class that needs
+/// objects' reference counting can inherit from this entity or have it as a
+/// class data member.
+template <typename Ty = uint32_t,
+ std::memory_order MemoryOrder = std::memory_order_relaxed>
+struct RefCountTy {
+ /// Create a refcount object initialized to zero.
+ RefCountTy() : Refs(0) {}
+
+ ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); }
+
+ /// Increase the reference count atomically.
+ void increase() { Refs.fetch_add(1, MemoryOrder); }
+
+ /// Decrease the reference count and return whether it became zero. Decreasing
+ /// the counter in more units than it was previously increased results in
+ /// undefined behavior.
+ bool decrease() {
+ Ty Prev = Refs.fetch_sub(1, MemoryOrder);
+ assert(Prev > 0 && "Invalid refcount");
+ return (Prev == 1);
+ }
+
+ Ty get() const { return Refs.load(MemoryOrder); }
+
+private:
+ /// The atomic reference counter.
+ std::atomic<Ty> Refs;
+};
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#endif
diff --git a/offload/include/Shared/Types.h b/offload/include/Shared/Types.h
new file mode 100644
index 0000000000000..15e3cfefa37ed
--- /dev/null
+++ b/offload/include/Shared/Types.h
@@ -0,0 +1,22 @@
+//===-- Shared/Types.h - Type defs shared between host and device - C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Type definitions shared between host and device.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_SHARED_TYPES_H
+#define OMPTARGET_SHARED_TYPES_H
+
+#ifndef OMPTARGET_DEVICE_RUNTIME
+#include <cstdint>
+#else
+#include "DeviceTypes.h"
+#endif
+
+#endif // OMPTARGET_SHARED_TYPES_H
diff --git a/offload/include/Shared/Utils.h b/offload/include/Shared/Utils.h
index fce14b54edb98..da83551fffd54 100644
--- a/offload/include/Shared/Utils.h
+++ b/offload/include/Shared/Utils.h
@@ -6,83 +6,73 @@
//
//===----------------------------------------------------------------------===//
//
-// Routines and classes used to provide useful functionalities like string
-// parsing and environment variables.
+// Routines and classes used to provide useful functionalities for the host and
+// the device.
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_SHARED_UTILS_H
#define OMPTARGET_SHARED_UTILS_H
-#include "llvm/ADT/StringRef.h"
+#include "Types.h"
-#include "Debug.h"
-
-#include <atomic>
-#include <cassert>
-#include <limits>
-#include <memory>
-
-namespace llvm {
-namespace omp {
-namespace target {
-
-/// Utility class for thread-safe reference counting. Any class that needs
-/// objects' reference counting can inherit from this entity or have it as a
-/// class data member.
-template <typename Ty = uint32_t,
- std::memory_order MemoryOrder = std::memory_order_relaxed>
-struct RefCountTy {
- /// Create a refcount object initialized to zero.
- RefCountTy() : Refs(0) {}
-
- ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); }
-
- /// Increase the reference count atomically.
- void increase() { Refs.fetch_add(1, MemoryOrder); }
-
- /// Decrease the reference count and return whether it became zero. Decreasing
- /// the counter in more units than it was previously increased results in
- /// undefined behavior.
- bool decrease() {
- Ty Prev = Refs.fetch_sub(1, MemoryOrder);
- assert(Prev > 0 && "Invalid refcount");
- return (Prev == 1);
- }
-
- Ty get() const { return Refs.load(MemoryOrder); }
-
-private:
- /// The atomic reference counter.
- std::atomic<Ty> Refs;
-};
+namespace utils {
/// Return the difference (in bytes) between \p Begin and \p End.
template <typename Ty = char>
-ptrdiff_t getPtrDiff(const void *End, const void *Begin) {
+auto getPtrDiff(const void *End, const void *Begin) {
return reinterpret_cast<const Ty *>(End) -
reinterpret_cast<const Ty *>(Begin);
}
/// Return \p Ptr advanced by \p Offset bytes.
-template <typename Ty> Ty *advanceVoidPtr(Ty *Ptr, int64_t Offset) {
- static_assert(std::is_void<Ty>::value);
- return const_cast<char *>(reinterpret_cast<const char *>(Ptr) + Offset);
+template <typename Ty1, typename Ty2> Ty1 *advancePtr(Ty1 *Ptr, Ty2 Offset) {
+ return (Ty1 *)(const_cast<char *>((const char *)(Ptr)) + Offset);
}
-/// Return \p Ptr aligned to \p Alignment bytes.
-template <typename Ty> Ty *alignPtr(Ty *Ptr, int64_t Alignment) {
- size_t Space = std::numeric_limits<size_t>::max();
- return std::align(Alignment, sizeof(char), Ptr, Space);
+/// Return \p V aligned "upwards" according to \p Align.
+template <typename Ty1, typename Ty2> inline Ty1 alignPtr(Ty1 V, Ty2 Align) {
+ return reinterpret_cast<Ty1>(((uintptr_t(V) + Align - 1) / Align) * Align);
+}
+/// Return \p V aligned "downwards" according to \p Align.
+template <typename Ty1, typename Ty2> inline Ty1 alignDown(Ty1 V, Ty2 Align) {
+ return V - V % Align;
}
/// Round up \p V to a \p Boundary.
template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
- return (V + Boundary - 1) / Boundary * Boundary;
+ return alignPtr(V, Boundary);
+}
+
+/// Return the first bit set in \p V.
+inline uint32_t ffs(uint32_t V) {
+ static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
+ return __builtin_ffs(V);
+}
+
+/// Return the first bit set in \p V.
+inline uint32_t ffs(uint64_t V) {
+ static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
+ return __builtin_ffsl(V);
+}
+
+/// Return the number of bits set in \p V.
+inline uint32_t popc(uint32_t V) {
+ static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
+ return __builtin_popcount(V);
+}
+
+/// Return the number of bits set in \p V.
+inline uint32_t popc(uint64_t V) {
+ static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
+ return __builtin_popcountl(V);
+}
+
+template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
+ static_assert(sizeof(DstTy) == sizeof(SrcTy), "Bad conversion");
+ return *((DstTy *)(&V));
}
-} // namespace target
-} // namespace omp
-} // namespace llvm
+} // namespace utils
#endif // OMPTARGET_SHARED_UTILS_H
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index e6643d3260eb4..9196e925d4753 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -23,6 +23,7 @@
#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
+#include "Shared/RefCnt.h"
#include "Shared/Utils.h"
#include "Utils/ELF.h"
@@ -87,7 +88,7 @@ struct AMDGPUDeviceImageTy;
struct AMDGPUMemoryManagerTy;
struct AMDGPUMemoryPoolTy;
-namespace utils {
+namespace hsa_utils {
/// Iterate elements using an HSA iterate function. Do not use this function
/// directly but the specialized ones below instead.
@@ -187,7 +188,7 @@ Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
Expected<std::string> getTargetTripleAndFeatures(hsa_agent_t Agent) {
std::string Target;
- auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
+ auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
uint32_t Length;
hsa_status_t Status;
Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
@@ -208,7 +209,7 @@ Expected<std::string> getTargetTripleAndFeatures(hsa_agent_t Agent) {
return Err;
return Target;
}
-} // namespace utils
+} // namespace hsa_utils
/// Utility class representing generic resource references to AMDGPU resources.
template <typename ResourceTy>
@@ -481,7 +482,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
/// Get additional info for kernel, e.g., register spill counts
- std::optional<utils::KernelMetaDataTy>
+ std::optional<hsa_utils::KernelMetaDataTy>
getKernelInfo(StringRef Identifier) const {
auto It = KernelInfoMap.find(Identifier);
@@ -495,7 +496,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
/// The exectuable loaded on the agent.
hsa_executable_t Executable;
hsa_code_object_t CodeObject;
- StringMap<utils::KernelMetaDataTy> KernelInfoMap;
+ StringMap<hsa_utils::KernelMetaDataTy> KernelInfoMap;
uint16_t ELFABIVersion;
};
@@ -545,7 +546,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
// TODO: Read the kernel descriptor for the max threads per block. May be
// read from the image.
- ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
+ ImplicitArgsSize =
+ hsa_utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
// Get additional kernel info read from image
@@ -596,7 +598,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
uint32_t ImplicitArgsSize;
/// Additional Info for the AMD GPU Kernel
- std::optional<utils::KernelMetaDataTy> KernelInfo;
+ std::optional<hsa_utils::KernelMetaDataTy> KernelInfo;
};
/// Class representing an HSA signal. Signals are used to define dependencies
@@ -1268,13 +1270,14 @@ struct AMDGPUStreamTy {
// Issue the async memory copy.
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
- return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
- CopySize, 1, &InputSignalRaw,
- OutputSignal->get());
+ return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src,
+ Agent, CopySize, 1, &InputSignalRaw,
+ OutputSignal->get());
}
- return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
- CopySize, 0, nullptr, OutputSignal->get());
+ return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src,
+ Agent, CopySize, 0, nullptr,
+ OutputSignal->get());
}
/// Push an asynchronous memory copy device-to-host involving an unpinned
@@ -1308,14 +1311,14 @@ struct AMDGPUStreamTy {
// dependency if already satisfied.
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
- if (auto Err = utils::asyncMemCopy(
+ if (auto Err = hsa_utils::asyncMemCopy(
UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
&InputSignalRaw, OutputSignals[0]->get()))
return Err;
} else {
- if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent,
- Src, Agent, CopySize, 0, nullptr,
- OutputSignals[0]->get()))
+ if (auto Err = hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Inter,
+ Agent, Src, Agent, CopySize, 0,
+ nullptr, OutputSignals[0]->get()))
return Err;
}
@@ -1406,12 +1409,13 @@ struct AMDGPUStreamTy {
// dependency if already satisfied.
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
- return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
- Agent, CopySize, 1, &InputSignalRaw,
- OutputSignal->get());
+ return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
+ Agent, CopySize, 1, &InputSignalRaw,
+ OutputSignal->get());
}
- return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
- CopySize, 0, nullptr, OutputSignal->get());
+ return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
+ Agent, CopySize, 0, nullptr,
+ OutputSignal->get());
}
// AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
@@ -1435,13 +1439,13 @@ struct AMDGPUStreamTy {
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
- return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
- SrcAgent, CopySize, 1, &InputSignalRaw,
- OutputSignal->get());
+ return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
+ SrcAgent, CopySize, 1, &InputSignalRaw,
+ OutputSignal->get());
}
- return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
- SrcAgent, CopySize, 0, nullptr,
- OutputSignal->get());
+ return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
+ SrcAgent, CopySize, 0, nullptr,
+ OutputSignal->get());
}
/// Synchronize with the stream. The current thread waits until all operations
@@ -1799,7 +1803,7 @@ struct AMDHostDeviceTy : public AMDGenericDeviceTy {
Error retrieveAllMemoryPools() override {
// Iterate through the available pools across the host agents.
for (hsa_agent_t Agent : Agents) {
- Error Err = utils::iterateAgentMemoryPools(
+ Error Err = hsa_utils::iterateAgentMemoryPools(
Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
AMDGPUMemoryPoolTy *MemoryPool =
new AMDGPUMemoryPoolTy(HSAMemoryPool);
@@ -1964,7 +1968,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
// Detect if XNACK is enabled
auto TargeTripleAndFeaturesOrError =
- utils::getTargetTripleAndFeatures(Agent);
+ hsa_utils::getTargetTripleAndFeatures(Agent);
if (!TargeTripleAndFeaturesOrError)
return TargeTripleAndFeaturesOrError.takeError();
if (static_cast<StringRef>(*TargeTripleAndFeaturesOrError)
@@ -2316,9 +2320,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = Signal.init())
return Err;
- if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
- Agent, PinnedPtr, Agent, Size, 0,
- nullptr, Signal.get()))
+ if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
+ Agent, PinnedPtr, Agent, Size, 0,
+ nullptr, Signal.get()))
return Err;
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2376,9 +2380,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = Signal.init())
return Err;
- if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr,
- Agent, TgtPtr, Agent, Size, 0, nullptr,
- Signal.get()))
+ if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(),
+ PinnedPtr, Agent, TgtPtr, Agent,
+ Size, 0, nullptr, Signal.get()))
return Err;
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2420,7 +2424,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = Signal.init())
return Err;
- if (auto Err = utils::asyncMemCopy(
+ if (auto Err = hsa_utils::asyncMemCopy(
useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr,
getAgent(), (uint64_t)Size, 0, nullptr, Signal.get()))
return Err;
@@ -2686,7 +2690,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
}
Info.add("ISAs");
- auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
+ auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
if (Status == HSA_STATUS_SUCCESS)
Info.add<InfoLevel2>("Name", TmpChar);
@@ -2768,7 +2772,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// Retrieve and construct all memory pools of the device agent.
Error retrieveAllMemoryPools() override {
// Iterate through the available pools of the device agent.
- return utils::iterateAgentMemoryPools(
+ return hsa_utils::iterateAgentMemoryPools(
Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
AMDGPUMemoryPoolTy *MemoryPool =
Plugin.allocate<AMDGPUMemoryPoolTy>();
@@ -2954,7 +2958,7 @@ Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
if (Result)
return Plugin::error("Loaded HSA executable does not validate");
- if (auto Err = utils::readAMDGPUMetaDataFromImage(
+ if (auto Err = hsa_utils::readAMDGPUMetaDataFromImage(
getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
return Err;
@@ -3083,7 +3087,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
llvm::SmallVector<hsa_agent_t> HostAgents;
// Count the number of available agents.
- auto Err = utils::iterateAgents([&](hsa_agent_t Agent) {
+ auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) {
// Get the device type of the agent.
hsa_device_type_t DeviceType;
hsa_status_t Status =
@@ -3178,12 +3182,12 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
return false;
auto TargeTripleAndFeaturesOrError =
- utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId));
+ hsa_utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId));
if (!TargeTripleAndFeaturesOrError)
return TargeTripleAndFeaturesOrError.takeError();
- return utils::isImageCompatibleWithEnv(Processor ? *Processor : "",
- ElfOrErr->getPlatformFlags(),
- *TargeTripleAndFeaturesOrError);
+ return hsa_utils::isImageCompatibleWithEnv(Processor ? *Processor : "",
+ ElfOrErr->getPlatformFlags(),
+ *TargeTripleAndFeaturesOrError);
}
bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
@@ -3295,11 +3299,11 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
return Err;
- utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
+ hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
// Initialize implicit arguments.
- ImplArgs = reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
- advanceVoidPtr(AllArgs, LaunchParams.Size));
+ ImplArgs = reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
+ utils::advancePtr(AllArgs, LaunchParams.Size));
// Initialize the implicit arguments to zero.
std::memset(ImplArgs, 0, getImplicitArgsSize());
@@ -3323,7 +3327,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
if (ImplArgs &&
- getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) {
+ getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) {
ImplArgs->BlockCountX = NumBlocks;
ImplArgs->BlockCountY = 1;
ImplArgs->BlockCountZ = 1;
diff --git a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
index 58a3b5df00fac..1e99d0a30bdf2 100644
--- a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -33,7 +33,7 @@ namespace llvm {
namespace omp {
namespace target {
namespace plugin {
-namespace utils {
+namespace hsa_utils {
// The implicit arguments of COV5 AMDGPU kernels.
struct AMDGPUImplicitArgsTy {
@@ -310,7 +310,7 @@ readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
return Error::success();
}
-} // namespace utils
+} // namespace hsa_utils
} // namespace plugin
} // namespace target
} // namespace omp
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 973add0ba1000..0f156032b63d1 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -231,7 +231,7 @@ class DeviceImageTy {
/// Get the image size.
size_t getSize() const {
- return getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart);
+ return utils::getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart);
}
/// Get a memory buffer reference to the whole image.
@@ -471,7 +471,7 @@ class PinnedAllocationMapTy {
--It;
// The buffer is not contained in the pinned allocation.
- if (advanceVoidPtr(It->HstPtr, It->Size) > HstPtr)
+ if (utils::advancePtr(It->HstPtr, It->Size) > HstPtr)
return &(*It);
// None found.
@@ -498,15 +498,15 @@ class PinnedAllocationMapTy {
/// Indicate whether the first range A fully contains the second range B.
static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
- void *EndA = advanceVoidPtr(PtrA, SizeA);
- void *EndB = advanceVoidPtr(PtrB, SizeB);
+ void *EndA = utils::advancePtr(PtrA, SizeA);
+ void *EndB = utils::advancePtr(PtrB, SizeB);
return (PtrB >= PtrA && EndB <= EndA);
}
/// Indicate whether the first range A intersects with the second range B.
static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
- void *EndA = advanceVoidPtr(PtrA, SizeA);
- void *EndB = advanceVoidPtr(PtrB, SizeB);
+ void *EndA = utils::advancePtr(PtrA, SizeA);
+ void *EndB = utils::advancePtr(PtrB, SizeB);
return (PtrA < EndB && PtrB < EndA);
}
@@ -588,8 +588,8 @@ class PinnedAllocationMapTy {
if (!Entry)
return nullptr;
- return advanceVoidPtr(Entry->DevAccessiblePtr,
- getPtrDiff(HstPtr, Entry->HstPtr));
+ return utils::advancePtr(Entry->DevAccessiblePtr,
+ utils::getPtrDiff(HstPtr, Entry->HstPtr));
}
/// Check whether a buffer belongs to a registered host pinned allocation.
diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
index ba0aa47f8e51c..e18f358af116c 100644
--- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -152,8 +152,8 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device,
HostGlobal.getPtr());
assert(Image.getStart() <= ImageGlobal.getPtr() &&
- advanceVoidPtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) <
- advanceVoidPtr(Image.getStart(), Image.getSize()) &&
+ utils::advancePtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) <
+ utils::advancePtr(Image.getStart(), Image.getSize()) &&
"Attempting to read outside the image!");
// Perform the copy from the image to the host memory.
diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp
index 9dbba1459839d..9adb62b677b92 100644
--- a/offload/plugins-nextgen/common/src/JIT.cpp
+++ b/offload/plugins-nextgen/common/src/JIT.cpp
@@ -51,7 +51,7 @@ namespace {
bool isImageBitcode(const __tgt_device_image &Image) {
StringRef Binary(reinterpret_cast<const char *>(Image.ImageStart),
- target::getPtrDiff(Image.ImageEnd, Image.ImageStart));
+ utils::getPtrDiff(Image.ImageEnd, Image.ImageStart));
return identify_magic(Binary) == file_magic::bitcode;
}
@@ -69,7 +69,7 @@ createModuleFromMemoryBuffer(std::unique_ptr<MemoryBuffer> &MB,
Expected<std::unique_ptr<Module>>
createModuleFromImage(const __tgt_device_image &Image, LLVMContext &Context) {
StringRef Data((const char *)Image.ImageStart,
- target::getPtrDiff(Image.ImageEnd, Image.ImageStart));
+ utils::getPtrDiff(Image.ImageEnd, Image.ImageStart));
std::unique_ptr<MemoryBuffer> MB = MemoryBuffer::getMemBuffer(
Data, /*BufferName=*/"", /*RequiresNullTerminator=*/false);
return createModuleFromMemoryBuffer(MB, Context);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 118265973f327..69c7a1cdded94 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -16,6 +16,7 @@
#include "GlobalHandler.h"
#include "JIT.h"
+#include "Shared/Utils.h"
#include "Utils/ELF.h"
#include "omptarget.h"
@@ -74,7 +75,7 @@ struct RecordReplayTy {
Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT);
Device->free(Addr);
// Align Address to MaxMemoryAllocation
- Addr = (void *)alignPtr((Addr), MaxMemoryAllocation);
+ Addr = (void *)utils::alignPtr((Addr), MaxMemoryAllocation);
return Addr;
}
@@ -207,8 +208,8 @@ struct RecordReplayTy {
if (EC)
report_fatal_error("Error saving image : " + StringRef(EC.message()));
if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) {
- size_t Size =
- getPtrDiff(TgtImageBitcode->ImageEnd, TgtImageBitcode->ImageStart);
+ size_t Size = utils::getPtrDiff(TgtImageBitcode->ImageEnd,
+ TgtImageBitcode->ImageStart);
MemoryBufferRef MBR = MemoryBufferRef(
StringRef((const char *)TgtImageBitcode->ImageStart, Size), "");
OS << MBR.getBuffer();
@@ -241,10 +242,10 @@ struct RecordReplayTy {
int32_t NameLength = std::strlen(OffloadEntry.Name) + 1;
memcpy(BufferPtr, OffloadEntry.Name, NameLength);
- BufferPtr = advanceVoidPtr(BufferPtr, NameLength);
+ BufferPtr = utils::advancePtr(BufferPtr, NameLength);
*((uint32_t *)(BufferPtr)) = OffloadEntry.Size;
- BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t));
+ BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t));
auto Err = Plugin::success();
{
@@ -254,11 +255,12 @@ struct RecordReplayTy {
}
if (Err)
report_fatal_error("Error retrieving data for global");
- BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.Size);
+ BufferPtr = utils::advancePtr(BufferPtr, OffloadEntry.Size);
}
assert(BufferPtr == GlobalsMB->get()->getBufferEnd() &&
"Buffer over/under-filled.");
- assert(Size == getPtrDiff(BufferPtr, GlobalsMB->get()->getBufferStart()) &&
+ assert(Size == utils::getPtrDiff(BufferPtr,
+ GlobalsMB->get()->getBufferStart()) &&
"Buffer size mismatch");
StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size);
@@ -906,7 +908,7 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
#ifdef OMPT_SUPPORT
if (ompt::Initialized) {
size_t Bytes =
- getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart);
+ utils::getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart);
performOmptCallback(
device_load, Plugin.getUserId(DeviceId),
/*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr,
@@ -1134,8 +1136,8 @@ Expected<void *> PinnedAllocationMapTy::lockHostBuffer(void *HstPtr,
return std::move(Err);
// Return the device accessible pointer with the correct offset.
- return advanceVoidPtr(Entry->DevAccessiblePtr,
- getPtrDiff(HstPtr, Entry->HstPtr));
+ return utils::advancePtr(Entry->DevAccessiblePtr,
+ utils::getPtrDiff(HstPtr, Entry->HstPtr));
}
// No intersecting registered allocation found in the map. First, lock the
@@ -1610,7 +1612,7 @@ int32_t GenericPluginTy::is_initialized() const { return Initialized; }
int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) {
StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
- target::getPtrDiff(Image->ImageEnd, Image->ImageStart));
+ utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));
auto HandleError = [&](Error Err) -> bool {
[[maybe_unused]] std::string ErrStr = toString(std::move(Err));
@@ -1642,7 +1644,7 @@ int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) {
int32_t GenericPluginTy::is_device_compatible(int32_t DeviceId,
__tgt_device_image *Image) {
StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
- target::getPtrDiff(Image->ImageEnd, Image->ImageStart));
+ utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));
auto HandleError = [&](Error Err) -> bool {
[[maybe_unused]] std::string ErrStr = toString(std::move(Err));
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index b6465d61bd033..015c7775ba351 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -705,7 +705,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::error("Wrong device Page size");
// Ceil to page size.
- Size = roundUp(Size, Granularity);
+ Size = utils::roundUp(Size, Granularity);
// Create a handler of our allocation
CUmemGenericAllocationHandle AHandle;
diff --git a/offload/src/DeviceImage.cpp b/offload/src/DeviceImage.cpp
index e42460b5cca4f..e5b4bf5526437 100644
--- a/offload/src/DeviceImage.cpp
+++ b/offload/src/DeviceImage.cpp
@@ -27,9 +27,8 @@ DeviceImageTy::DeviceImageTy(__tgt_bin_desc &BinaryDesc,
__tgt_device_image &TgtDeviceImage)
: BinaryDesc(&BinaryDesc), Image(TgtDeviceImage) {
- llvm::StringRef ImageStr(
- static_cast<char *>(Image.ImageStart),
- llvm::omp::target::getPtrDiff(Image.ImageEnd, Image.ImageStart));
+ llvm::StringRef ImageStr(static_cast<char *>(Image.ImageStart),
+ utils::getPtrDiff(Image.ImageEnd, Image.ImageStart));
auto BinaryOrErr =
llvm::object::OffloadBinary::create(llvm::MemoryBufferRef(ImageStr, ""));
diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp
index 9bca8529c5ee3..fe45c1fb9e73c 100644
--- a/offload/src/omptarget.cpp
+++ b/offload/src/omptarget.cpp
@@ -323,8 +323,8 @@ void handleTargetOutcome(bool Success, ident_t *Loc) {
for (auto &Image : PM->deviceImages()) {
const char *Start = reinterpret_cast<const char *>(
Image.getExecutableImage().ImageStart);
- uint64_t Length = llvm::omp::target::getPtrDiff(
- Start, Image.getExecutableImage().ImageEnd);
+ uint64_t Length =
+ utils::getPtrDiff(Start, Image.getExecutableImage().ImageEnd);
llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length),
/*Identifier=*/"");
More information about the llvm-commits
mailing list