[llvm] [OpenMP] Add explicit attributes to every function declaration (PR #122399)
Joseph Huber via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 9 17:40:32 PST 2025
https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/122399
Summary:
Instead of relying on the scoped assumption pragma, add an explicit attribute to every function declaration.
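For context, a minimal sketch of the pattern this patch adopts, pieced together from the DeviceTypes.h and Allocator.h hunks below; the empty host-side fallback definition of OMP_ATTRS is an assumption for illustration and is not shown in this excerpt:

  // Before: the assumption was applied file-wide through a scoped pragma,
  //   #pragma omp assumes ext_no_call_asm
  // After: a macro expands to the attribute only when targeting the GPU, and
  // every declaration in the DeviceRTL carries it explicitly.
  #pragma omp begin declare variant match(device = {kind(gpu)})
  #define OMP_ATTRS [[omp::assume("ext_no_call_asm")]]
  #pragma omp end declare variant
  #ifndef OMP_ATTRS
  #define OMP_ATTRS // assumed empty fallback for host compilation (not in this excerpt)
  #endif

  OMP_ATTRS void *__kmpc_alloc_shared(uint64_t Bytes);
  OMP_ATTRS void __kmpc_free_shared(void *Ptr, uint64_t Bytes);

This keeps the ext_no_call_asm assumption attached to each declaration rather than depending on a file-scoped pragma.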
From 19b46c821b5a1c3988c28022bdcbcb4e59fe15a3 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 9 Jan 2025 18:45:09 -0600
Subject: [PATCH] [OpenMP] Add explicit attributes to every function
declaration
Summary:
Instead of relying on the scoped assumption pragma, add an explicit attribute to every function declaration.
---
offload/DeviceRTL/CMakeLists.txt | 2 +-
offload/DeviceRTL/include/Allocator.h | 11 +-
offload/DeviceRTL/include/Configuration.h | 26 +--
offload/DeviceRTL/include/Debug.h | 11 +-
offload/DeviceRTL/include/DeviceTypes.h | 6 +-
offload/DeviceRTL/include/DeviceUtils.h | 21 +-
offload/DeviceRTL/include/Interface.h | 230 ++++++++++----------
offload/DeviceRTL/include/LibC.h | 6 +-
offload/DeviceRTL/include/Mapping.h | 44 ++--
offload/DeviceRTL/include/Profiling.h | 8 +-
offload/DeviceRTL/include/State.h | 95 ++++----
offload/DeviceRTL/include/Synchronization.h | 53 ++---
offload/DeviceRTL/include/Workshare.h | 4 +-
offload/DeviceRTL/src/Allocator.cpp | 13 +-
offload/DeviceRTL/src/Configuration.cpp | 28 +--
offload/DeviceRTL/src/Debug.cpp | 12 +-
offload/DeviceRTL/src/DeviceUtils.cpp | 67 +++---
offload/DeviceRTL/src/Kernel.cpp | 13 +-
offload/DeviceRTL/src/LibC.cpp | 17 +-
offload/DeviceRTL/src/Mapping.cpp | 141 ++++++------
offload/DeviceRTL/src/Misc.cpp | 36 +--
offload/DeviceRTL/src/Parallelism.cpp | 38 ++--
offload/DeviceRTL/src/Profiling.cpp | 6 +-
offload/DeviceRTL/src/Reduction.cpp | 54 ++---
offload/DeviceRTL/src/State.cpp | 171 ++++++++-------
offload/DeviceRTL/src/Synchronization.cpp | 204 +++++++++--------
offload/DeviceRTL/src/Tasking.cpp | 52 ++---
offload/DeviceRTL/src/Workshare.cpp | 216 +++++++++---------
28 files changed, 849 insertions(+), 736 deletions(-)
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 099634e211e7a7..e6859ab3d9e9e3 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -98,7 +98,7 @@ list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I")
set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
${clang_opt_flags} --offload-device-only
-nocudalib -nogpulib -nogpuinc -nostdlibinc
- -fopenmp -fopenmp-cuda-mode
+ -fopenmp -fopenmp-cuda-mode -Wno-unknown-assumption
-Wno-unknown-cuda-version -Wno-openmp-target
-DOMPTARGET_DEVICE_RUNTIME
-I${include_directory}
diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h
index 475f6a21bb47eb..d3ff7185bb29bb 100644
--- a/offload/DeviceRTL/include/Allocator.h
+++ b/offload/DeviceRTL/include/Allocator.h
@@ -26,22 +26,23 @@ namespace allocator {
static uint64_t constexpr ALIGNMENT = 16;
/// Initialize the allocator according to \p KernelEnvironment
-void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
+OMP_ATTRS void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
/// Allocate \p Size bytes.
-[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
+[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT),
+ gnu::malloc]] OMP_ATTRS void *
alloc(uint64_t Size);
/// Free the allocation pointed to by \p Ptr.
-void free(void *Ptr);
+OMP_ATTRS void free(void *Ptr);
} // namespace allocator
} // namespace ompx
extern "C" {
-[[gnu::weak]] void *malloc(size_t Size);
-[[gnu::weak]] void free(void *Ptr);
+[[gnu::weak]] OMP_ATTRS void *malloc(size_t Size);
+[[gnu::weak]] OMP_ATTRS void free(void *Ptr);
}
#pragma omp end declare target
diff --git a/offload/DeviceRTL/include/Configuration.h b/offload/DeviceRTL/include/Configuration.h
index f8b7a6c3c6c9da..cf638838a7d382 100644
--- a/offload/DeviceRTL/include/Configuration.h
+++ b/offload/DeviceRTL/include/Configuration.h
@@ -22,45 +22,45 @@ namespace config {
/// Return the number of devices in the system, same number as returned on the
/// host by omp_get_num_devices.
-uint32_t getNumDevices();
+OMP_ATTRS uint32_t getNumDevices();
/// Return the device number in the system for omp_get_device_num.
-uint32_t getDeviceNum();
+OMP_ATTRS uint32_t getDeviceNum();
/// Return the user choosen debug level.
-uint32_t getDebugKind();
+OMP_ATTRS uint32_t getDebugKind();
/// Return if teams oversubscription is assumed
-uint32_t getAssumeTeamsOversubscription();
+OMP_ATTRS uint32_t getAssumeTeamsOversubscription();
/// Return if threads oversubscription is assumed
-uint32_t getAssumeThreadsOversubscription();
+OMP_ATTRS uint32_t getAssumeThreadsOversubscription();
/// Return the amount of dynamic shared memory that was allocated at launch.
-uint64_t getDynamicMemorySize();
+OMP_ATTRS uint64_t getDynamicMemorySize();
/// Returns the cycles per second of the device's fixed frequency clock.
-uint64_t getClockFrequency();
+OMP_ATTRS uint64_t getClockFrequency();
/// Returns the pointer to the beginning of the indirect call table.
-void *getIndirectCallTablePtr();
+OMP_ATTRS void *getIndirectCallTablePtr();
/// Returns the size of the indirect call table.
-uint64_t getIndirectCallTableSize();
+OMP_ATTRS uint64_t getIndirectCallTableSize();
/// Returns the size of the indirect call table.
-uint64_t getHardwareParallelism();
+OMP_ATTRS uint64_t getHardwareParallelism();
/// Return if debugging is enabled for the given debug kind.
-bool isDebugMode(DeviceDebugKind Level);
+OMP_ATTRS bool isDebugMode(DeviceDebugKind Level);
/// Indicates if this kernel may require thread-specific states, or if it was
/// explicitly disabled by the user.
-bool mayUseThreadStates();
+OMP_ATTRS bool mayUseThreadStates();
/// Indicates if this kernel may require data environments for nested
/// parallelism, or if it was explicitly disabled by the user.
-bool mayUseNestedParallelism();
+OMP_ATTRS bool mayUseNestedParallelism();
} // namespace config
} // namespace ompx
diff --git a/offload/DeviceRTL/include/Debug.h b/offload/DeviceRTL/include/Debug.h
index 22998f44a5bea5..31b465fe425b39 100644
--- a/offload/DeviceRTL/include/Debug.h
+++ b/offload/DeviceRTL/include/Debug.h
@@ -19,11 +19,12 @@
///
/// {
extern "C" {
-void __assert_assume(bool condition);
-void __assert_fail(const char *expr, const char *file, unsigned line,
- const char *function);
-void __assert_fail_internal(const char *expr, const char *msg, const char *file,
- unsigned line, const char *function);
+OMP_ATTRS void __assert_assume(bool condition);
+OMP_ATTRS void __assert_fail(const char *expr, const char *file, unsigned line,
+ const char *function);
+OMP_ATTRS void __assert_fail_internal(const char *expr, const char *msg,
+ const char *file, unsigned line,
+ const char *function);
}
#define ASSERT(expr, msg) \
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index 259bc008f91d13..404c2d7ca8d5ef 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -20,9 +20,9 @@
// another function but only inline assembly that performs some operation or
// side-effect and then continues execution with something on the existing call
// stack.
-//
-// TODO: Find a good place for this
-#pragma omp assumes ext_no_call_asm
+#pragma omp begin declare variant match(device = {kind(gpu)})
+#define OMP_ATTRS [[omp::assume("ext_no_call_asm")]]
+#pragma omp end declare variant
enum omp_proc_bind_t {
omp_proc_bind_false = 0,
diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h
index fa66b973a4f5e7..fddd0c8722f3f2 100644
--- a/offload/DeviceRTL/include/DeviceUtils.h
+++ b/offload/DeviceRTL/include/DeviceUtils.h
@@ -60,32 +60,35 @@ struct remove_addrspace<T [[clang::address_space(N)]]> : type_identity<T> {};
template <class T>
using remove_addrspace_t = typename remove_addrspace<T>::type;
-template <typename To, typename From> inline To bitCast(From V) {
+template <typename To, typename From> OMP_ATTRS inline To bitCast(From V) {
static_assert(sizeof(To) == sizeof(From), "Bad conversion");
return __builtin_bit_cast(To, V);
}
/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread
/// is identified by \p Mask.
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
+OMP_ATTRS int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
+ int32_t Width);
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width);
+OMP_ATTRS int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
+ int32_t Width);
-int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width);
+OMP_ATTRS int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
+ int32_t Width);
-uint64_t ballotSync(uint64_t Mask, int32_t Pred);
+OMP_ATTRS uint64_t ballotSync(uint64_t Mask, int32_t Pred);
/// Return \p LowBits and \p HighBits packed into a single 64 bit value.
-uint64_t pack(uint32_t LowBits, uint32_t HighBits);
+OMP_ATTRS uint64_t pack(uint32_t LowBits, uint32_t HighBits);
/// Unpack \p Val into \p LowBits and \p HighBits.
-void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits);
+OMP_ATTRS void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits);
/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)).
-bool isSharedMemPtr(void *Ptr);
+OMP_ATTRS bool isSharedMemPtr(void *Ptr);
/// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)).
-bool isThreadLocalMemPtr(void *Ptr);
+OMP_ATTRS bool isThreadLocalMemPtr(void *Ptr);
/// A pointer variable that has by design an `undef` value. Use with care.
[[clang::loader_uninitialized]] static void *const UndefPtr;
diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h
index c4bfaaa2404b4f..cb0bfed8edc9df 100644
--- a/offload/DeviceRTL/include/Interface.h
+++ b/offload/DeviceRTL/include/Interface.h
@@ -28,8 +28,8 @@ extern "C" {
/// getter: returns 0.
///
///{
-void omp_set_dynamic(int);
-int omp_get_dynamic(void);
+OMP_ATTRS void omp_set_dynamic(int);
+OMP_ATTRS int omp_get_dynamic(void);
///}
/// ICV: nthreads-var, integer
@@ -43,8 +43,8 @@ int omp_get_dynamic(void);
///
///
///{
-void omp_set_num_threads(int);
-int omp_get_max_threads(void);
+OMP_ATTRS void omp_set_num_threads(int);
+OMP_ATTRS int omp_get_max_threads(void);
///}
/// ICV: thread-limit-var, computed
@@ -52,7 +52,7 @@ int omp_get_max_threads(void);
/// getter: returns thread limited defined during launch.
///
///{
-int omp_get_thread_limit(void);
+OMP_ATTRS int omp_get_thread_limit(void);
///}
/// ICV: max-active-level-var, constant 1
@@ -61,8 +61,8 @@ int omp_get_thread_limit(void);
/// getter: returns 1.
///
///{
-void omp_set_max_active_levels(int);
-int omp_get_max_active_levels(void);
+OMP_ATTRS void omp_set_max_active_levels(int);
+OMP_ATTRS int omp_get_max_active_levels(void);
///}
/// ICV: places-partition-var
@@ -76,7 +76,7 @@ int omp_get_max_active_levels(void);
/// getter: returns 0 or 1.
///
///{
-int omp_get_active_level(void);
+OMP_ATTRS int omp_get_active_level(void);
///}
/// ICV: level-var
@@ -84,88 +84,88 @@ int omp_get_active_level(void);
/// getter: returns parallel region nesting
///
///{
-int omp_get_level(void);
+OMP_ATTRS int omp_get_level(void);
///}
/// ICV: run-sched-var
///
///
///{
-void omp_set_schedule(omp_sched_t, int);
-void omp_get_schedule(omp_sched_t *, int *);
+OMP_ATTRS void omp_set_schedule(omp_sched_t, int);
+OMP_ATTRS void omp_get_schedule(omp_sched_t *, int *);
///}
/// TODO this is incomplete.
-int omp_get_num_threads(void);
-int omp_get_thread_num(void);
-void omp_set_nested(int);
+OMP_ATTRS int omp_get_num_threads(void);
+OMP_ATTRS int omp_get_thread_num(void);
+OMP_ATTRS void omp_set_nested(int);
-int omp_get_nested(void);
+OMP_ATTRS int omp_get_nested(void);
-void omp_set_max_active_levels(int Level);
+OMP_ATTRS void omp_set_max_active_levels(int Level);
-int omp_get_max_active_levels(void);
+OMP_ATTRS int omp_get_max_active_levels(void);
-omp_proc_bind_t omp_get_proc_bind(void);
+OMP_ATTRS omp_proc_bind_t omp_get_proc_bind(void);
-int omp_get_num_places(void);
+OMP_ATTRS int omp_get_num_places(void);
-int omp_get_place_num_procs(int place_num);
+OMP_ATTRS int omp_get_place_num_procs(int place_num);
-void omp_get_place_proc_ids(int place_num, int *ids);
+OMP_ATTRS void omp_get_place_proc_ids(int place_num, int *ids);
-int omp_get_place_num(void);
+OMP_ATTRS int omp_get_place_num(void);
-int omp_get_partition_num_places(void);
+OMP_ATTRS int omp_get_partition_num_places(void);
-void omp_get_partition_place_nums(int *place_nums);
+OMP_ATTRS void omp_get_partition_place_nums(int *place_nums);
-int omp_get_cancellation(void);
+OMP_ATTRS int omp_get_cancellation(void);
-void omp_set_default_device(int deviceId);
+OMP_ATTRS void omp_set_default_device(int deviceId);
-int omp_get_default_device(void);
+OMP_ATTRS int omp_get_default_device(void);
-int omp_get_num_devices(void);
+OMP_ATTRS int omp_get_num_devices(void);
-int omp_get_device_num(void);
+OMP_ATTRS int omp_get_device_num(void);
-int omp_get_num_teams(void);
+OMP_ATTRS int omp_get_num_teams(void);
-int omp_get_team_num();
+OMP_ATTRS int omp_get_team_num();
-int omp_get_initial_device(void);
+OMP_ATTRS int omp_get_initial_device(void);
-void *llvm_omp_target_dynamic_shared_alloc();
+OMP_ATTRS void *llvm_omp_target_dynamic_shared_alloc();
/// Synchronization
///
///{
-void omp_init_lock(omp_lock_t *Lock);
+OMP_ATTRS void omp_init_lock(omp_lock_t *Lock);
-void omp_destroy_lock(omp_lock_t *Lock);
+OMP_ATTRS void omp_destroy_lock(omp_lock_t *Lock);
-void omp_set_lock(omp_lock_t *Lock);
+OMP_ATTRS void omp_set_lock(omp_lock_t *Lock);
-void omp_unset_lock(omp_lock_t *Lock);
+OMP_ATTRS void omp_unset_lock(omp_lock_t *Lock);
-int omp_test_lock(omp_lock_t *Lock);
+OMP_ATTRS int omp_test_lock(omp_lock_t *Lock);
///}
/// Tasking
///
///{
-int omp_in_final(void);
+OMP_ATTRS int omp_in_final(void);
-int omp_get_max_task_priority(void);
+OMP_ATTRS int omp_get_max_task_priority(void);
///}
/// Misc
///
///{
-double omp_get_wtick(void);
+OMP_ATTRS double omp_get_wtick(void);
-double omp_get_wtime(void);
+OMP_ATTRS double omp_get_wtime(void);
///}
}
@@ -173,16 +173,16 @@ extern "C" {
/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be
/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be
/// called by any thread, allocation happens *per thread*.
-void *__kmpc_alloc_shared(uint64_t Bytes);
+OMP_ATTRS void *__kmpc_alloc_shared(uint64_t Bytes);
/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like
/// a stack (push/pop). Can be called by any thread. \p Ptr has to be the
/// allocated by __kmpc_alloc_shared by the same thread.
-void __kmpc_free_shared(void *Ptr, uint64_t Bytes);
+OMP_ATTRS void __kmpc_free_shared(void *Ptr, uint64_t Bytes);
/// Get a pointer to the memory buffer containing dynamically allocated shared
/// memory configured at launch.
-void *__kmpc_get_dynamic_shared();
+OMP_ATTRS void *__kmpc_get_dynamic_shared();
/// Allocate sufficient space for \p NumArgs sequential `void*` and store the
/// allocation address in \p GlobalArgs.
@@ -191,27 +191,28 @@ void *__kmpc_get_dynamic_shared();
///
/// We also remember it in GlobalArgsPtr to ensure the worker threads and
/// deallocation function know the allocation address too.
-void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs);
+OMP_ATTRS void __kmpc_begin_sharing_variables(void ***GlobalArgs,
+ uint64_t NumArgs);
/// Deallocate the memory allocated by __kmpc_begin_sharing_variables.
///
/// Called by the main thread after a parallel region.
-void __kmpc_end_sharing_variables();
+OMP_ATTRS void __kmpc_end_sharing_variables();
/// Store the allocation address obtained via __kmpc_begin_sharing_variables in
/// \p GlobalArgs.
///
/// Called by the worker threads in the parallel region (function).
-void __kmpc_get_shared_variables(void ***GlobalArgs);
+OMP_ATTRS void __kmpc_get_shared_variables(void ***GlobalArgs);
/// External interface to get the thread ID.
-uint32_t __kmpc_get_hardware_thread_id_in_block();
+OMP_ATTRS uint32_t __kmpc_get_hardware_thread_id_in_block();
/// External interface to get the number of threads.
-uint32_t __kmpc_get_hardware_num_threads_in_block();
+OMP_ATTRS uint32_t __kmpc_get_hardware_num_threads_in_block();
/// External interface to get the warp size.
-uint32_t __kmpc_get_warp_size();
+OMP_ATTRS uint32_t __kmpc_get_warp_size();
/// Kernel
///
@@ -219,27 +220,26 @@ uint32_t __kmpc_get_warp_size();
// Forward declaration
struct KernelEnvironmentTy;
-int8_t __kmpc_is_spmd_exec_mode();
+OMP_ATTRS int8_t __kmpc_is_spmd_exec_mode();
-int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+OMP_ATTRS int32_t
+__kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
+ KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
-void __kmpc_target_deinit();
+OMP_ATTRS void __kmpc_target_deinit();
///}
/// Reduction
///
///{
-void *__kmpc_reduction_get_fixed_buffer();
+OMP_ATTRS void *__kmpc_reduction_get_fixed_buffer();
-int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
- uint64_t reduce_data_size,
- void *reduce_data,
- ShuffleReductFnTy shflFct,
- InterWarpCopyFnTy cpyFct);
+OMP_ATTRS int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
+ IdentTy *Loc, uint64_t reduce_data_size, void *reduce_data,
+ ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct);
-int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
+OMP_ATTRS int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
@@ -249,116 +249,120 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
/// Synchronization
///
///{
-void __kmpc_ordered(IdentTy *Loc, int32_t TId);
+OMP_ATTRS void __kmpc_ordered(IdentTy *Loc, int32_t TId);
-void __kmpc_end_ordered(IdentTy *Loc, int32_t TId);
+OMP_ATTRS void __kmpc_end_ordered(IdentTy *Loc, int32_t TId);
-int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId);
+OMP_ATTRS int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId);
-void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId);
+OMP_ATTRS void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId);
-void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId);
+OMP_ATTRS void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId);
-void __kmpc_barrier_simple_generic(IdentTy *Loc_ref, int32_t TId);
+OMP_ATTRS void __kmpc_barrier_simple_generic(IdentTy *Loc_ref, int32_t TId);
-int32_t __kmpc_master(IdentTy *Loc, int32_t TId);
+OMP_ATTRS int32_t __kmpc_master(IdentTy *Loc, int32_t TId);
-void __kmpc_end_master(IdentTy *Loc, int32_t TId);
+OMP_ATTRS void __kmpc_end_master(IdentTy *Loc, int32_t TId);
-int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter);
+OMP_ATTRS int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter);
-void __kmpc_end_masked(IdentTy *Loc, int32_t TId);
+OMP_ATTRS void __kmpc_end_masked(IdentTy *Loc, int32_t TId);
-int32_t __kmpc_single(IdentTy *Loc, int32_t TId);
+OMP_ATTRS int32_t __kmpc_single(IdentTy *Loc, int32_t TId);
-void __kmpc_end_single(IdentTy *Loc, int32_t TId);
+OMP_ATTRS void __kmpc_end_single(IdentTy *Loc, int32_t TId);
-void __kmpc_flush(IdentTy *Loc);
+OMP_ATTRS void __kmpc_flush(IdentTy *Loc);
-uint64_t __kmpc_warp_active_thread_mask(void);
+OMP_ATTRS uint64_t __kmpc_warp_active_thread_mask(void);
-void __kmpc_syncwarp(uint64_t Mask);
+OMP_ATTRS void __kmpc_syncwarp(uint64_t Mask);
-void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name);
+OMP_ATTRS void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name);
-void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name);
+OMP_ATTRS void __kmpc_end_critical(IdentTy *Loc, int32_t TId,
+ CriticalNameTy *Name);
///}
/// Parallelism
///
///{
/// TODO
-void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy WorkFn);
+OMP_ATTRS void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy WorkFn);
/// TODO
-bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn);
+OMP_ATTRS bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn);
/// TODO
-void __kmpc_kernel_end_parallel();
+OMP_ATTRS void __kmpc_kernel_end_parallel();
/// TODO
-void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind);
+OMP_ATTRS void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind);
/// TODO
-void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId, int32_t NumTeams,
- int32_t ThreadLimit);
+OMP_ATTRS void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId,
+ int32_t NumTeams, int32_t ThreadLimit);
/// TODO
-uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t);
+OMP_ATTRS uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t);
///}
/// Tasking
///
///{
-TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
- size_t TaskSizeInclPrivateValues,
- size_t SharedValuesSize,
- TaskFnTy TaskFn);
+OMP_ATTRS TaskDescriptorTy *
+__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
+ size_t TaskSizeInclPrivateValues, size_t SharedValuesSize,
+ TaskFnTy TaskFn);
-int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor);
+OMP_ATTRS int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId,
+ TaskDescriptorTy *TaskDescriptor);
-int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor, int32_t,
- void *, int32_t, void *);
+OMP_ATTRS int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
+ TaskDescriptorTy *TaskDescriptor,
+ int32_t, void *, int32_t, void *);
-void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor);
+OMP_ATTRS void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId,
+ TaskDescriptorTy *TaskDescriptor);
-void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor);
+OMP_ATTRS void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId,
+ TaskDescriptorTy *TaskDescriptor);
-void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t,
- void *);
+OMP_ATTRS void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *,
+ int32_t, void *);
-void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId);
+OMP_ATTRS void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId);
-void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId);
+OMP_ATTRS void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId);
-int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int);
+OMP_ATTRS int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int);
-int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId);
+OMP_ATTRS int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId);
-void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor, int,
- uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int,
- int32_t, uint64_t, void *);
+OMP_ATTRS void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
+ TaskDescriptorTy *TaskDescriptor, int,
+ uint64_t *LowerBound, uint64_t *UpperBound,
+ int64_t, int, int32_t, uint64_t, void *);
///}
/// Misc
///
///{
-int32_t __kmpc_cancellationpoint(IdentTy *Loc, int32_t TId, int32_t CancelVal);
+OMP_ATTRS int32_t __kmpc_cancellationpoint(IdentTy *Loc, int32_t TId,
+ int32_t CancelVal);
-int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal);
+OMP_ATTRS int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal);
///}
/// Shuffle
///
///{
-int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
-int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
+OMP_ATTRS int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta,
+ int16_t size);
+OMP_ATTRS int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta,
+ int16_t size);
///}
}
diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h
index 03febdb5083423..9c722d1ac28f80 100644
--- a/offload/DeviceRTL/include/LibC.h
+++ b/offload/DeviceRTL/include/LibC.h
@@ -16,9 +16,9 @@
extern "C" {
-int memcmp(const void *lhs, const void *rhs, size_t count);
-void memset(void *dst, int C, size_t count);
-int printf(const char *format, ...);
+OMP_ATTRS int memcmp(const void *lhs, const void *rhs, size_t count);
+OMP_ATTRS void memset(void *dst, int C, size_t count);
+OMP_ATTRS int printf(const char *format, ...);
}
#endif
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index 2fb87abe5418c0..92afb41f10e908 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -31,67 +31,67 @@ inline constexpr uint32_t MaxThreadsPerTeam = 1024;
#pragma omp end declare target
/// Initialize the mapping machinery.
-void init(bool IsSPMD);
+OMP_ATTRS void init(bool IsSPMD);
/// Return true if the kernel is executed in SPMD mode.
-bool isSPMDMode();
+OMP_ATTRS bool isSPMDMode();
/// Return true if the kernel is executed in generic mode.
-bool isGenericMode();
+OMP_ATTRS bool isGenericMode();
/// Return true if the executing thread is the main thread in generic mode.
/// These functions will lookup state and it is required that that is OK for the
/// thread and location. See also `isInitialThreadInLevel0` for a stateless
/// alternative for certain situations, e.g. during initialization.
-bool isMainThreadInGenericMode();
-bool isMainThreadInGenericMode(bool IsSPMD);
+OMP_ATTRS bool isMainThreadInGenericMode();
+OMP_ATTRS bool isMainThreadInGenericMode(bool IsSPMD);
/// Return true if this thread is the initial thread in parallel level 0.
///
/// The thread for which this returns true should be used for single threaded
/// initialization tasks. We pick a special thread to ensure there are no
/// races between the initialization and the first read of initialized state.
-bool isInitialThreadInLevel0(bool IsSPMD);
+OMP_ATTRS bool isInitialThreadInLevel0(bool IsSPMD);
/// Return true if the executing thread has the lowest Id of the active threads
/// in the warp.
-bool isLeaderInWarp();
+OMP_ATTRS bool isLeaderInWarp();
/// Return a mask describing all active threads in the warp.
-LaneMaskTy activemask();
+OMP_ATTRS LaneMaskTy activemask();
/// Return a mask describing all threads with a smaller Id in the warp.
-LaneMaskTy lanemaskLT();
+OMP_ATTRS LaneMaskTy lanemaskLT();
/// Return a mask describing all threads with a larget Id in the warp.
-LaneMaskTy lanemaskGT();
+OMP_ATTRS LaneMaskTy lanemaskGT();
/// Return the thread Id in the warp, in [0, getWarpSize()).
-uint32_t getThreadIdInWarp();
+OMP_ATTRS uint32_t getThreadIdInWarp();
/// Return the warp size, thus number of threads in the warp.
-uint32_t getWarpSize();
+OMP_ATTRS uint32_t getWarpSize();
/// Return the warp id in the block, in [0, getNumberOfWarpsInBlock()]
-uint32_t getWarpIdInBlock();
+OMP_ATTRS uint32_t getWarpIdInBlock();
/// Return the number of warps in the block.
-uint32_t getNumberOfWarpsInBlock();
+OMP_ATTRS uint32_t getNumberOfWarpsInBlock();
/// Return the thread Id in the block, in [0, getNumberOfThreadsInBlock(Dim)).
-uint32_t getThreadIdInBlock(int32_t Dim = DIM_X);
+OMP_ATTRS uint32_t getThreadIdInBlock(int32_t Dim = DIM_X);
/// Return the block size, thus number of threads in the block.
-uint32_t getNumberOfThreadsInBlock(int32_t Dim = DIM_X);
+OMP_ATTRS uint32_t getNumberOfThreadsInBlock(int32_t Dim = DIM_X);
/// Return the block Id in the kernel, in [0, getNumberOfBlocksInKernel(Dim)).
-uint32_t getBlockIdInKernel(int32_t Dim = DIM_X);
+OMP_ATTRS uint32_t getBlockIdInKernel(int32_t Dim = DIM_X);
/// Return the number of blocks in the kernel.
-uint32_t getNumberOfBlocksInKernel(int32_t Dim = DIM_X);
+OMP_ATTRS uint32_t getNumberOfBlocksInKernel(int32_t Dim = DIM_X);
/// Return the kernel size, thus number of threads in the kernel.
-uint32_t getNumberOfThreadsInKernel();
+OMP_ATTRS uint32_t getNumberOfThreadsInKernel();
/// Return the maximal number of threads in the block usable for a team (=
/// parallel region).
@@ -99,11 +99,11 @@ uint32_t getNumberOfThreadsInKernel();
/// Note: The version taking \p IsSPMD mode explicitly can be used during the
/// initialization of the target region, that is before `mapping::isSPMDMode()`
/// can be called by any thread other than the main one.
-uint32_t getMaxTeamThreads();
-uint32_t getMaxTeamThreads(bool IsSPMD);
+OMP_ATTRS uint32_t getMaxTeamThreads();
+OMP_ATTRS uint32_t getMaxTeamThreads(bool IsSPMD);
/// Return the number of processing elements on the device.
-uint32_t getNumberOfProcessorElements();
+OMP_ATTRS uint32_t getNumberOfProcessorElements();
} // namespace mapping
diff --git a/offload/DeviceRTL/include/Profiling.h b/offload/DeviceRTL/include/Profiling.h
index d9947522541219..560581939c540b 100644
--- a/offload/DeviceRTL/include/Profiling.h
+++ b/offload/DeviceRTL/include/Profiling.h
@@ -12,10 +12,12 @@
#ifndef OMPTARGET_DEVICERTL_PROFILING_H
#define OMPTARGET_DEVICERTL_PROFILING_H
+#include "DeviceTypes.h"
+
extern "C" {
-void __llvm_profile_register_function(void *Ptr);
-void __llvm_profile_register_names_function(void *Ptr, long int I);
-void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2);
+OMP_ATTRS void __llvm_profile_register_function(void *Ptr);
+OMP_ATTRS void __llvm_profile_register_names_function(void *Ptr, long int I);
+OMP_ATTRS void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2);
}
#endif
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
index 565235cd48a913..f491d88adbe398 100644
--- a/offload/DeviceRTL/include/State.h
+++ b/offload/DeviceRTL/include/State.h
@@ -31,21 +31,21 @@ namespace memory {
/// Alloca \p Size bytes in shared memory, if possible, for \p Reason.
///
/// Note: See the restrictions on __kmpc_alloc_shared for proper usage.
-void *allocShared(uint64_t Size, const char *Reason);
+OMP_ATTRS void *allocShared(uint64_t Size, const char *Reason);
/// Free \p Ptr, alloated via allocShared, for \p Reason.
///
/// Note: See the restrictions on __kmpc_free_shared for proper usage.
-void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);
+OMP_ATTRS void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);
/// Alloca \p Size bytes in global memory, if possible, for \p Reason.
-void *allocGlobal(uint64_t Size, const char *Reason);
+OMP_ATTRS void *allocGlobal(uint64_t Size, const char *Reason);
/// Return a pointer to the dynamic shared memory buffer.
-void *getDynamicBuffer();
+OMP_ATTRS void *getDynamicBuffer();
/// Free \p Ptr, alloated via allocGlobal, for \p Reason.
-void freeGlobal(void *Ptr, const char *Reason);
+OMP_ATTRS void freeGlobal(void *Ptr, const char *Reason);
} // namespace memory
@@ -62,17 +62,17 @@ struct ICVStateTy {
uint32_t RunSchedVar;
uint32_t RunSchedChunkVar;
- bool operator==(const ICVStateTy &Other) const;
+ OMP_ATTRS bool operator==(const ICVStateTy &Other) const;
- void assertEqual(const ICVStateTy &Other) const;
+ OMP_ATTRS void assertEqual(const ICVStateTy &Other) const;
};
struct TeamStateTy {
- void init(bool IsSPMD);
+ OMP_ATTRS void init(bool IsSPMD);
- bool operator==(const TeamStateTy &) const;
+ OMP_ATTRS bool operator==(const TeamStateTy &) const;
- void assertEqual(TeamStateTy &Other) const;
+ OMP_ATTRS void assertEqual(TeamStateTy &Other) const;
/// ICVs
///
@@ -104,12 +104,12 @@ struct ThreadStateTy {
ThreadStateTy *PreviousThreadState;
- void init() {
+ OMP_ATTRS void init() {
ICVState = TeamState.ICVState;
PreviousThreadState = nullptr;
}
- void init(ThreadStateTy *PreviousTS) {
+ OMP_ATTRS void init(ThreadStateTy *PreviousTS) {
ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
PreviousThreadState = PreviousTS;
}
@@ -119,15 +119,15 @@ extern ThreadStateTy **ThreadStates;
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
/// Initialize the state machinery. Must be called by all threads.
-void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+OMP_ATTRS void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
+ KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
/// Return the kernel and kernel launch environment associated with the current
/// kernel. The former is static and contains compile time information that
/// holds for all instances of the kernel. The latter is dynamic and provides
/// per-launch information.
-KernelEnvironmentTy &getKernelEnvironment();
-KernelLaunchEnvironmentTy &getKernelLaunchEnvironment();
+OMP_ATTRS KernelEnvironmentTy &getKernelEnvironment();
+OMP_ATTRS KernelLaunchEnvironmentTy &getKernelLaunchEnvironment();
/// TODO
enum ValueKind {
@@ -144,22 +144,23 @@ enum ValueKind {
};
/// TODO
-void enterDataEnvironment(IdentTy *Ident);
+OMP_ATTRS void enterDataEnvironment(IdentTy *Ident);
/// TODO
-void exitDataEnvironment();
+OMP_ATTRS void exitDataEnvironment();
/// TODO
struct DateEnvironmentRAII {
- DateEnvironmentRAII(IdentTy *Ident) { enterDataEnvironment(Ident); }
- ~DateEnvironmentRAII() { exitDataEnvironment(); }
+ OMP_ATTRS DateEnvironmentRAII(IdentTy *Ident) { enterDataEnvironment(Ident); }
+ OMP_ATTRS ~DateEnvironmentRAII() { exitDataEnvironment(); }
};
/// TODO
-void resetStateForThread(uint32_t TId);
+OMP_ATTRS void resetStateForThread(uint32_t TId);
-inline uint32_t &lookupForModify32Impl(uint32_t state::ICVStateTy::*Var,
- IdentTy *Ident, bool ForceTeamState) {
+OMP_ATTRS inline uint32_t &
+lookupForModify32Impl(uint32_t state::ICVStateTy::*Var, IdentTy *Ident,
+ bool ForceTeamState) {
if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() ||
!TeamState.HasThreadState))
return TeamState.ICVState.*Var;
@@ -174,8 +175,8 @@ inline uint32_t &lookupForModify32Impl(uint32_t state::ICVStateTy::*Var,
return ThreadStates[TId]->ICVState.*Var;
}
-inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var,
- bool ForceTeamState) {
+OMP_ATTRS inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var,
+ bool ForceTeamState) {
auto TId = mapping::getThreadIdInBlock();
if (OMP_UNLIKELY(!ForceTeamState && config::mayUseThreadStates() &&
TeamState.HasThreadState && ThreadStates[TId]))
@@ -183,7 +184,7 @@ inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var,
return TeamState.ICVState.*Var;
}
-[[gnu::always_inline, gnu::flatten]] inline uint32_t &
+[[gnu::always_inline, gnu::flatten]] OMP_ATTRS inline uint32_t &
lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
switch (Kind) {
case state::VK_NThreads:
@@ -225,7 +226,7 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
__builtin_unreachable();
}
-[[gnu::always_inline, gnu::flatten]] inline void *&
+[[gnu::always_inline, gnu::flatten]] OMP_ATTRS inline void *&
lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
switch (Kind) {
case state::VK_ParallelRegionFn:
@@ -239,45 +240,48 @@ lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
/// A class without actual state used to provide a nice interface to lookup and
/// update ICV values we can declare in global scope.
template <typename Ty, ValueKind Kind> struct Value {
- [[gnu::flatten, gnu::always_inline]] operator Ty() {
+ [[gnu::flatten, gnu::always_inline]] OMP_ATTRS operator Ty() {
return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr,
/*ForceTeamState=*/false);
}
- [[gnu::flatten, gnu::always_inline]] Value &operator=(const Ty &Other) {
+ [[gnu::flatten, gnu::always_inline]] OMP_ATTRS Value &
+ operator=(const Ty &Other) {
set(Other, /*IdentTy=*/nullptr);
return *this;
}
- [[gnu::flatten, gnu::always_inline]] Value &operator++() {
+ [[gnu::flatten, gnu::always_inline]] OMP_ATTRS Value &operator++() {
inc(1, /*IdentTy=*/nullptr);
return *this;
}
- [[gnu::flatten, gnu::always_inline]] Value &operator--() {
+ [[gnu::flatten, gnu::always_inline]] OMP_ATTRS Value &operator--() {
inc(-1, /*IdentTy=*/nullptr);
return *this;
}
- [[gnu::flatten, gnu::always_inline]] void
+ [[gnu::flatten, gnu::always_inline]] OMP_ATTRS void
assert_eq(const Ty &V, IdentTy *Ident = nullptr,
bool ForceTeamState = false) {
ASSERT(lookup(/*IsReadonly=*/true, Ident, ForceTeamState) == V, nullptr);
}
private:
- [[gnu::flatten, gnu::always_inline]] Ty &
+ [[gnu::flatten, gnu::always_inline]] OMP_ATTRS Ty &
lookup(bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
Ty &t = lookup32(Kind, IsReadonly, Ident, ForceTeamState);
return t;
}
- [[gnu::flatten, gnu::always_inline]] Ty &inc(int UpdateVal, IdentTy *Ident) {
+ [[gnu::flatten, gnu::always_inline]] OMP_ATTRS Ty &inc(int UpdateVal,
+ IdentTy *Ident) {
return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) +=
UpdateVal);
}
- [[gnu::flatten, gnu::always_inline]] Ty &set(Ty UpdateVal, IdentTy *Ident) {
+ [[gnu::flatten, gnu::always_inline]] OMP_ATTRS Ty &set(Ty UpdateVal,
+ IdentTy *Ident) {
return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) =
UpdateVal);
}
@@ -289,22 +293,23 @@ template <typename Ty, ValueKind Kind> struct Value {
/// a nice interface to lookup and update ICV values
/// we can declare in global scope.
template <typename Ty, ValueKind Kind> struct PtrValue {
- [[gnu::flatten, gnu::always_inline]] operator Ty() {
+ [[gnu::flatten, gnu::always_inline]] OMP_ATTRS operator Ty() {
return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr,
/*ForceTeamState=*/false);
}
- [[gnu::flatten, gnu::always_inline]] PtrValue &operator=(const Ty Other) {
+ [[gnu::flatten, gnu::always_inline]] OMP_ATTRS PtrValue &
+ operator=(const Ty Other) {
set(Other);
return *this;
}
private:
- Ty &lookup(bool IsReadonly, IdentTy *, bool ForceTeamState) {
+ OMP_ATTRS Ty &lookup(bool IsReadonly, IdentTy *, bool ForceTeamState) {
return lookupPtr(Kind, IsReadonly, ForceTeamState);
}
- Ty &set(Ty UpdateVal) {
+ OMP_ATTRS Ty &set(Ty UpdateVal) {
return (lookup(/*IsReadonly=*/false, /*IdentTy=*/nullptr,
/*ForceTeamState=*/false) = UpdateVal);
}
@@ -313,8 +318,8 @@ template <typename Ty, ValueKind Kind> struct PtrValue {
};
template <typename VTy, typename Ty> struct ValueRAII {
- ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active, IdentTy *Ident,
- bool ForceTeamState = false)
+ OMP_ATTRS ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active,
+ IdentTy *Ident, bool ForceTeamState = false)
: Ptr(Active ? &V.lookup(/*IsReadonly=*/false, Ident, ForceTeamState)
: (Ty *)utils::UndefPtr),
Val(OldValue), Active(Active) {
@@ -323,7 +328,7 @@ template <typename VTy, typename Ty> struct ValueRAII {
ASSERT(*Ptr == OldValue, "ValueRAII initialization with wrong old value!");
*Ptr = NewValue;
}
- ~ValueRAII() {
+ OMP_ATTRS ~ValueRAII() {
if (Active)
*Ptr = Val;
}
@@ -347,12 +352,12 @@ inline state::Value<uint32_t, state::VK_HasThreadState> HasThreadState;
inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn>
ParallelRegionFn;
-void runAndCheckState(void(Func(void)));
+OMP_ATTRS void runAndCheckState(void(Func(void)));
-void assumeInitialState(bool IsSPMD);
+OMP_ATTRS void assumeInitialState(bool IsSPMD);
/// Return the value of the ParallelTeamSize ICV.
-int getEffectivePTeamSize();
+OMP_ATTRS int getEffectivePTeamSize();
} // namespace state
diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h
index e1968675550d49..4691b281761dd5 100644
--- a/offload/DeviceRTL/include/Synchronization.h
+++ b/offload/DeviceRTL/include/Synchronization.h
@@ -43,39 +43,40 @@ enum MemScopeTy {
};
/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
-uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::all);
+OMP_ATTRS uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
+ MemScopeTy MemScope = MemScopeTy::all);
/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
/// result is stored in \p *Addr;
/// {
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-bool cas(Ty *Address, V ExpectedV, V DesiredV, atomic::OrderingTy OrderingSucc,
- atomic::OrderingTy OrderingFail) {
+OMP_ATTRS bool cas(Ty *Address, V ExpectedV, V DesiredV,
+ atomic::OrderingTy OrderingSucc,
+ atomic::OrderingTy OrderingFail) {
return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false,
OrderingSucc, OrderingFail,
__MEMORY_SCOPE_DEVICE);
}
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V add(Ty *Address, V Val, atomic::OrderingTy Ordering) {
+OMP_ATTRS V add(Ty *Address, V Val, atomic::OrderingTy Ordering) {
return __scoped_atomic_fetch_add(Address, Val, Ordering,
__MEMORY_SCOPE_DEVICE);
}
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V load(Ty *Address, atomic::OrderingTy Ordering) {
+OMP_ATTRS V load(Ty *Address, atomic::OrderingTy Ordering) {
return add(Address, Ty(0), Ordering);
}
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-void store(Ty *Address, V Val, atomic::OrderingTy Ordering) {
+OMP_ATTRS void store(Ty *Address, V Val, atomic::OrderingTy Ordering) {
__scoped_atomic_store_n(Address, Val, Ordering, __MEMORY_SCOPE_DEVICE);
}
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V mul(Ty *Address, V Val, atomic::OrderingTy Ordering) {
+OMP_ATTRS V mul(Ty *Address, V Val, atomic::OrderingTy Ordering) {
Ty TypedCurrentVal, TypedResultVal, TypedNewVal;
bool Success;
do {
@@ -88,14 +89,14 @@ V mul(Ty *Address, V Val, atomic::OrderingTy Ordering) {
}
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<!utils::is_floating_point_v<V>, V>
+OMP_ATTRS utils::enable_if_t<!utils::is_floating_point_v<V>, V>
max(Ty *Address, V Val, atomic::OrderingTy Ordering) {
return __scoped_atomic_fetch_max(Address, Val, Ordering,
__MEMORY_SCOPE_DEVICE);
}
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<utils::is_same_v<V, float>, V>
+OMP_ATTRS utils::enable_if_t<utils::is_same_v<V, float>, V>
max(Ty *Address, V Val, atomic::OrderingTy Ordering) {
if (Val >= 0)
return utils::bitCast<float>(
@@ -105,7 +106,7 @@ max(Ty *Address, V Val, atomic::OrderingTy Ordering) {
}
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<utils::is_same_v<V, double>, V>
+OMP_ATTRS utils::enable_if_t<utils::is_same_v<V, double>, V>
max(Ty *Address, V Val, atomic::OrderingTy Ordering) {
if (Val >= 0)
return utils::bitCast<double>(
@@ -115,7 +116,7 @@ max(Ty *Address, V Val, atomic::OrderingTy Ordering) {
}
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<!utils::is_floating_point_v<V>, V>
+OMP_ATTRS utils::enable_if_t<!utils::is_floating_point_v<V>, V>
min(Ty *Address, V Val, atomic::OrderingTy Ordering) {
return __scoped_atomic_fetch_min(Address, Val, Ordering,
__MEMORY_SCOPE_DEVICE);
@@ -123,7 +124,7 @@ min(Ty *Address, V Val, atomic::OrderingTy Ordering) {
// TODO: Implement this with __atomic_fetch_max and remove the duplication.
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<utils::is_same_v<V, float>, V>
+OMP_ATTRS utils::enable_if_t<utils::is_same_v<V, float>, V>
min(Ty *Address, V Val, atomic::OrderingTy Ordering) {
if (Val >= 0)
return utils::bitCast<float>(
@@ -134,7 +135,7 @@ min(Ty *Address, V Val, atomic::OrderingTy Ordering) {
// TODO: Implement this with __atomic_fetch_max and remove the duplication.
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<utils::is_same_v<V, double>, V>
+OMP_ATTRS utils::enable_if_t<utils::is_same_v<V, double>, V>
min(Ty *Address, utils::remove_addrspace_t<Ty> Val,
atomic::OrderingTy Ordering) {
if (Val >= 0)
@@ -145,25 +146,25 @@ min(Ty *Address, utils::remove_addrspace_t<Ty> Val,
}
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering) {
+OMP_ATTRS V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering) {
return __scoped_atomic_fetch_or(Address, Val, Ordering,
__MEMORY_SCOPE_DEVICE);
}
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering) {
+OMP_ATTRS V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering) {
return __scoped_atomic_fetch_and(Address, Val, Ordering,
__MEMORY_SCOPE_DEVICE);
}
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering) {
+OMP_ATTRS V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering) {
return __scoped_atomic_fetch_xor(Address, Val, Ordering,
__MEMORY_SCOPE_DEVICE);
}
-static inline uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
- atomic::OrderingTy Ordering) {
+OMP_ATTRS static inline uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
+ atomic::OrderingTy Ordering) {
uint32_t R;
__scoped_atomic_exchange(Address, &Val, &R, Ordering, __MEMORY_SCOPE_DEVICE);
return R;
@@ -176,15 +177,15 @@ static inline uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
namespace synchronize {
/// Initialize the synchronization machinery. Must be called by all threads.
-void init(bool IsSPMD);
+OMP_ATTRS void init(bool IsSPMD);
/// Synchronize all threads in a warp identified by \p Mask.
-void warp(LaneMaskTy Mask);
+OMP_ATTRS void warp(LaneMaskTy Mask);
/// Synchronize all threads in a block and perform a fence before and after the
/// barrier according to \p Ordering. Note that the fence might be part of the
/// barrier.
-void threads(atomic::OrderingTy Ordering);
+OMP_ATTRS void threads(atomic::OrderingTy Ordering);
/// Synchronizing threads is allowed even if they all hit different instances of
/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
@@ -198,7 +199,7 @@ void threads(atomic::OrderingTy Ordering);
/// (hence all threads in the block are "aligned"). Also perform a fence before
/// and after the barrier according to \p Ordering. Note that the
/// fence might be part of the barrier if the target offers this.
-[[gnu::noinline]] void threadsAligned(atomic::OrderingTy Ordering);
+[[gnu::noinline]] OMP_ATTRS void threadsAligned(atomic::OrderingTy Ordering);
#pragma omp end assumes
///}
@@ -208,13 +209,13 @@ void threads(atomic::OrderingTy Ordering);
namespace fence {
/// Memory fence with \p Ordering semantics for the team.
-void team(atomic::OrderingTy Ordering);
+OMP_ATTRS void team(atomic::OrderingTy Ordering);
/// Memory fence with \p Ordering semantics for the contention group.
-void kernel(atomic::OrderingTy Ordering);
+OMP_ATTRS void kernel(atomic::OrderingTy Ordering);
/// Memory fence with \p Ordering semantics for the system.
-void system(atomic::OrderingTy Ordering);
+OMP_ATTRS void system(atomic::OrderingTy Ordering);
} // namespace fence
diff --git a/offload/DeviceRTL/include/Workshare.h b/offload/DeviceRTL/include/Workshare.h
index fa9b3b2430b8c4..14127b1841e6cd 100644
--- a/offload/DeviceRTL/include/Workshare.h
+++ b/offload/DeviceRTL/include/Workshare.h
@@ -12,6 +12,8 @@
#ifndef OMPTARGET_WORKSHARE_H
#define OMPTARGET_WORKSHARE_H
+#include "DeviceTypes.h"
+
#pragma omp begin declare target device_type(nohost)
namespace ompx {
@@ -19,7 +21,7 @@ namespace ompx {
namespace workshare {
/// Initialize the worksharing machinery.
-void init(bool IsSPMD);
+OMP_ATTRS void init(bool IsSPMD);
} // namespace workshare
diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp
index ac662c48d4f5fb..c970742b8b596b 100644
--- a/offload/DeviceRTL/src/Allocator.cpp
+++ b/offload/DeviceRTL/src/Allocator.cpp
@@ -32,7 +32,7 @@ using namespace ompx;
/// directly.
struct BumpAllocatorTy final {
- void *alloc(uint64_t Size) {
+ OMP_ATTRS void *alloc(uint64_t Size) {
Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
@@ -58,7 +58,7 @@ struct BumpAllocatorTy final {
return reinterpret_cast<void *>(OldData);
}
- void free(void *) {}
+ OMP_ATTRS void free(void *) {}
};
BumpAllocatorTy BumpAllocator;
@@ -67,14 +67,17 @@ BumpAllocatorTy BumpAllocator;
///
///{
-void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
+OMP_ATTRS void allocator::init(bool IsSPMD,
+ KernelEnvironmentTy &KernelEnvironment) {
// TODO: Check KernelEnvironment for an allocator choice as soon as we have
// more than one.
}
-void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
+OMP_ATTRS void *allocator::alloc(uint64_t Size) {
+ return BumpAllocator.alloc(Size);
+}
-void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
+OMP_ATTRS void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
///}
diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp
index 9e14c203d4a04e..8ef990c5b1a5f6 100644
--- a/offload/DeviceRTL/src/Configuration.cpp
+++ b/offload/DeviceRTL/src/Configuration.cpp
@@ -33,54 +33,56 @@ using namespace ompx;
gnu::visibility("protected")]] DeviceEnvironmentTy
CONSTANT(__omp_rtl_device_environment);
-uint32_t config::getAssumeTeamsOversubscription() {
+OMP_ATTRS uint32_t config::getAssumeTeamsOversubscription() {
return __omp_rtl_assume_teams_oversubscription;
}
-uint32_t config::getAssumeThreadsOversubscription() {
+OMP_ATTRS uint32_t config::getAssumeThreadsOversubscription() {
return __omp_rtl_assume_threads_oversubscription;
}
-uint32_t config::getDebugKind() {
+OMP_ATTRS uint32_t config::getDebugKind() {
return __omp_rtl_debug_kind & __omp_rtl_device_environment.DeviceDebugKind;
}
-uint32_t config::getNumDevices() {
+OMP_ATTRS uint32_t config::getNumDevices() {
return __omp_rtl_device_environment.NumDevices;
}
-uint32_t config::getDeviceNum() {
+OMP_ATTRS uint32_t config::getDeviceNum() {
return __omp_rtl_device_environment.DeviceNum;
}
-uint64_t config::getDynamicMemorySize() {
+OMP_ATTRS uint64_t config::getDynamicMemorySize() {
return __omp_rtl_device_environment.DynamicMemSize;
}
-uint64_t config::getClockFrequency() {
+OMP_ATTRS uint64_t config::getClockFrequency() {
return __omp_rtl_device_environment.ClockFrequency;
}
-void *config::getIndirectCallTablePtr() {
+OMP_ATTRS void *config::getIndirectCallTablePtr() {
return reinterpret_cast<void *>(
__omp_rtl_device_environment.IndirectCallTable);
}
-uint64_t config::getHardwareParallelism() {
+OMP_ATTRS uint64_t config::getHardwareParallelism() {
return __omp_rtl_device_environment.HardwareParallelism;
}
-uint64_t config::getIndirectCallTableSize() {
+OMP_ATTRS uint64_t config::getIndirectCallTableSize() {
return __omp_rtl_device_environment.IndirectCallTableSize;
}
-bool config::isDebugMode(DeviceDebugKind Kind) {
+OMP_ATTRS bool config::isDebugMode(DeviceDebugKind Kind) {
return config::getDebugKind() & uint32_t(Kind);
}
-bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; }
+OMP_ATTRS bool config::mayUseThreadStates() {
+ return !__omp_rtl_assume_no_thread_state;
+}
-bool config::mayUseNestedParallelism() {
+OMP_ATTRS bool config::mayUseNestedParallelism() {
if (__omp_rtl_assume_no_nested_parallelism)
return false;
return state::getKernelEnvironment().Configuration.MayUseNestedParallelism;
diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp
index b451f17c6bbd89..07743b58898d9c 100644
--- a/offload/DeviceRTL/src/Debug.cpp
+++ b/offload/DeviceRTL/src/Debug.cpp
@@ -24,17 +24,19 @@ using namespace ompx;
#pragma omp begin declare target device_type(nohost)
extern "C" {
-void __assert_assume(bool condition) { __builtin_assume(condition); }
+OMP_ATTRS void __assert_assume(bool condition) { __builtin_assume(condition); }
#ifndef OMPTARGET_HAS_LIBC
-[[gnu::weak]] void __assert_fail(const char *expr, const char *file,
- unsigned line, const char *function) {
+[[gnu::weak]] OMP_ATTRS void __assert_fail(const char *expr, const char *file,
+ unsigned line,
+ const char *function) {
__assert_fail_internal(expr, nullptr, file, line, function);
}
#endif
-void __assert_fail_internal(const char *expr, const char *msg, const char *file,
- unsigned line, const char *function) {
+OMP_ATTRS void __assert_fail_internal(const char *expr, const char *msg,
+ const char *file, unsigned line,
+ const char *function) {
if (msg) {
PRINTF("%s:%u: %s: Assertion %s (`%s`) failed.\n", file, line, function,
msg, expr);
diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp
index c204a7be73b1fc..41a8aae619d9c7 100644
--- a/offload/DeviceRTL/src/DeviceUtils.cpp
+++ b/offload/DeviceRTL/src/DeviceUtils.cpp
@@ -21,48 +21,50 @@ using namespace ompx;
namespace impl {
-bool isSharedMemPtr(const void *Ptr) { return false; }
+OMP_ATTRS bool isSharedMemPtr(const void *Ptr) { return false; }
-void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
+OMP_ATTRS void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
static_assert(sizeof(unsigned long) == 8, "");
*LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
*HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
}
-uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
+OMP_ATTRS uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
}
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
- int32_t Width);
+OMP_ATTRS int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
+ int32_t Width);
+OMP_ATTRS int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
+ int32_t Width);
-uint64_t ballotSync(uint64_t Mask, int32_t Pred);
+OMP_ATTRS uint64_t ballotSync(uint64_t Mask, int32_t Pred);
/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
+OMP_ATTRS int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
+ int32_t Width) {
int Self = mapping::getThreadIdInWarp();
int Index = SrcLane + (Self & ~(Width - 1));
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
- int32_t Width) {
+OMP_ATTRS int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
+ int32_t Width) {
int Self = mapping::getThreadIdInWarp();
int Index = Self + LaneDelta;
Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
+OMP_ATTRS uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
return Mask & __builtin_amdgcn_ballot_w64(Pred);
}
-bool isSharedMemPtr(const void *Ptr) {
+OMP_ATTRS bool isSharedMemPtr(const void *Ptr) {
return __builtin_amdgcn_is_shared(
(const __attribute__((address_space(0))) void *)Ptr);
}
@@ -76,45 +78,50 @@ bool isSharedMemPtr(const void *Ptr) {
device = {arch(nvptx, nvptx64)}, \
implementation = {extension(match_any)})
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
+OMP_ATTRS int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
+ int32_t Width) {
return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
}
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
+OMP_ATTRS int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
+ int32_t Width) {
int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
}
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
+OMP_ATTRS uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
return __nvvm_vote_ballot_sync(static_cast<uint32_t>(Mask), Pred);
}
-bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
+OMP_ATTRS bool isSharedMemPtr(const void *Ptr) {
+ return __nvvm_isspacep_shared(Ptr);
+}
#pragma omp end declare variant
///}
} // namespace impl
-uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
+OMP_ATTRS uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
return impl::Pack(LowBits, HighBits);
}
-void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
+OMP_ATTRS void utils::unpack(uint64_t Val, uint32_t &LowBits,
+ uint32_t &HighBits) {
impl::Unpack(Val, &LowBits, &HighBits);
}
-int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
- int32_t Width) {
+OMP_ATTRS int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
+ int32_t Width) {
return impl::shuffle(Mask, Var, SrcLane, Width);
}
-int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
- int32_t Width) {
+OMP_ATTRS int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
+ int32_t Width) {
return impl::shuffleDown(Mask, Var, Delta, Width);
}
-int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
- int32_t Width) {
+OMP_ATTRS int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
+ int32_t Width) {
uint32_t Lo, Hi;
utils::unpack(Var, Lo, Hi);
Hi = impl::shuffleDown(Mask, Hi, Delta, Width);
@@ -122,18 +129,22 @@ int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
return utils::pack(Lo, Hi);
}
-uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
+OMP_ATTRS uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
return impl::ballotSync(Mask, Pred);
}
-bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
+OMP_ATTRS bool utils::isSharedMemPtr(void *Ptr) {
+ return impl::isSharedMemPtr(Ptr);
+}
extern "C" {
-int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
+OMP_ATTRS int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta,
+ int16_t SrcLane) {
return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
}
-int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
+OMP_ATTRS int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta,
+ int16_t Width) {
return utils::shuffleDown(lanes::All, Val, Delta, Width);
}
}
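
The 64-bit utils::shuffleDown overload above moves a value between lanes by splitting it into two 32-bit halves, shuffling each half, and re-packing. A minimal host-side sketch, assuming utils::pack/unpack are the plain low/high 32-bit splits shown here (the actual impl::Pack/Unpack bodies are not in this hunk):

#include <cstdint>
#include <cstdio>

static uint64_t pack(uint32_t Lo, uint32_t Hi) { return (uint64_t)Hi << 32 | Lo; }
static void unpack(uint64_t V, uint32_t &Lo, uint32_t &Hi) {
  Lo = (uint32_t)V;
  Hi = (uint32_t)(V >> 32);
}

int main() {
  uint64_t Var = 0x0123456789abcdefULL;
  uint32_t Lo, Hi;
  unpack(Var, Lo, Hi);
  // On the device, Lo and Hi would each go through impl::shuffleDown here; the
  // identity round trip below just shows the halves recombine losslessly.
  uint64_t Back = pack(Lo, Hi);
  std::printf("0x%016llx -> lo=0x%08x hi=0x%08x -> 0x%016llx\n",
              (unsigned long long)Var, Lo, Hi, (unsigned long long)Back);
  return 0;
}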
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 8bb275eae776c6..dd6fb74e03d100 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -27,7 +27,7 @@ using namespace ompx;
#pragma omp begin declare target device_type(nohost)
-static void
+OMP_ATTRS static void
inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
// Order is important here.
@@ -39,7 +39,7 @@ inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
}
/// Simple generic state machine for worker threads.
-static void genericStateMachine(IdentTy *Ident) {
+OMP_ATTRS static void genericStateMachine(IdentTy *Ident) {
uint32_t TId = mapping::getThreadIdInBlock();
do {
@@ -73,8 +73,9 @@ extern "C" {
///
/// \param Ident Source location identification, can be NULL.
///
-int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+OMP_ATTRS int32_t
+__kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
+ KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
bool IsSPMD = Configuration.ExecMode &
llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
@@ -130,7 +131,7 @@ int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
///
/// \param Ident Source location identification, can be NULL.
///
-void __kmpc_target_deinit() {
+OMP_ATTRS void __kmpc_target_deinit() {
bool IsSPMD = mapping::isSPMDMode();
if (IsSPMD)
return;
@@ -153,7 +154,7 @@ void __kmpc_target_deinit() {
}
}
-int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
+OMP_ATTRS int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
}
#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp
index 291ceb023a69c5..b11f4368a07b43 100644
--- a/offload/DeviceRTL/src/LibC.cpp
+++ b/offload/DeviceRTL/src/LibC.cpp
@@ -11,7 +11,7 @@
#pragma omp begin declare target device_type(nohost)
namespace impl {
-int32_t omp_vprintf(const char *Format, __builtin_va_list vlist);
+OMP_ATTRS int32_t omp_vprintf(const char *Format, __builtin_va_list vlist);
}
#ifndef OMPTARGET_HAS_LIBC
@@ -19,26 +19,27 @@ namespace impl {
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, \
implementation = {extension(match_any)})
-extern "C" int vprintf(const char *format, ...);
-int omp_vprintf(const char *Format, __builtin_va_list vlist) {
+extern "C" OMP_ATTRS int vprintf(const char *format, ...);
+OMP_ATTRS int omp_vprintf(const char *Format, __builtin_va_list vlist) {
return vprintf(Format, vlist);
}
#pragma omp end declare variant
#pragma omp begin declare variant match(device = {arch(amdgcn)})
-int omp_vprintf(const char *Format, __builtin_va_list) { return -1; }
+OMP_ATTRS int omp_vprintf(const char *Format, __builtin_va_list) { return -1; }
#pragma omp end declare variant
} // namespace impl
-extern "C" int printf(const char *Format, ...) {
+extern "C" OMP_ATTRS int printf(const char *Format, ...) {
__builtin_va_list vlist;
   __builtin_va_start(vlist, Format);
return impl::omp_vprintf(Format, vlist);
}
#endif // OMPTARGET_HAS_LIBC
extern "C" {
-[[gnu::weak]] int memcmp(const void *lhs, const void *rhs, size_t count) {
+[[gnu::weak]] OMP_ATTRS int memcmp(const void *lhs, const void *rhs,
+ size_t count) {
auto *L = reinterpret_cast<const unsigned char *>(lhs);
auto *R = reinterpret_cast<const unsigned char *>(rhs);
@@ -49,7 +50,7 @@ extern "C" {
return 0;
}
-[[gnu::weak]] void memset(void *dst, int C, size_t count) {
+[[gnu::weak]] OMP_ATTRS void memset(void *dst, int C, size_t count) {
auto *dstc = reinterpret_cast<char *>(dst);
for (size_t I = 0; I < count; ++I)
dstc[I] = C;
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 8583a539824c82..885a19c14ba197 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -25,27 +25,27 @@ namespace ompx {
namespace impl {
// Forward declarations defined to be defined for AMDGCN and NVPTX.
-LaneMaskTy activemask();
-LaneMaskTy lanemaskLT();
-LaneMaskTy lanemaskGT();
-uint32_t getThreadIdInWarp();
-uint32_t getThreadIdInBlock(int32_t Dim);
-uint32_t getNumberOfThreadsInBlock(int32_t Dim);
-uint32_t getNumberOfThreadsInKernel();
-uint32_t getBlockIdInKernel(int32_t Dim);
-uint32_t getNumberOfBlocksInKernel(int32_t Dim);
-uint32_t getWarpIdInBlock();
-uint32_t getNumberOfWarpsInBlock();
-uint32_t getWarpSize();
+OMP_ATTRS LaneMaskTy activemask();
+OMP_ATTRS LaneMaskTy lanemaskLT();
+OMP_ATTRS LaneMaskTy lanemaskGT();
+OMP_ATTRS uint32_t getThreadIdInWarp();
+OMP_ATTRS uint32_t getThreadIdInBlock(int32_t Dim);
+OMP_ATTRS uint32_t getNumberOfThreadsInBlock(int32_t Dim);
+OMP_ATTRS uint32_t getNumberOfThreadsInKernel();
+OMP_ATTRS uint32_t getBlockIdInKernel(int32_t Dim);
+OMP_ATTRS uint32_t getNumberOfBlocksInKernel(int32_t Dim);
+OMP_ATTRS uint32_t getWarpIdInBlock();
+OMP_ATTRS uint32_t getNumberOfWarpsInBlock();
+OMP_ATTRS uint32_t getWarpSize();
/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
-uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
+OMP_ATTRS uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
+OMP_ATTRS uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
switch (Dim) {
case 0:
return __builtin_amdgcn_workgroup_size_x();
@@ -57,16 +57,16 @@ uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
UNREACHABLE("Dim outside range!");
}
-LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
+OMP_ATTRS LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
-LaneMaskTy lanemaskLT() {
+OMP_ATTRS LaneMaskTy lanemaskLT() {
uint32_t Lane = mapping::getThreadIdInWarp();
int64_t Ballot = mapping::activemask();
uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
return Mask & Ballot;
}
-LaneMaskTy lanemaskGT() {
+OMP_ATTRS LaneMaskTy lanemaskGT() {
uint32_t Lane = mapping::getThreadIdInWarp();
if (Lane == (mapping::getWarpSize() - 1))
return 0;
@@ -75,11 +75,11 @@ LaneMaskTy lanemaskGT() {
return Mask & Ballot;
}
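
A host-side sketch of the lane-mask helpers above. lanemaskLT is exactly the shown formula; the mask line of lanemaskGT falls outside this hunk, so the "all bits above the current lane" form used here is an assumption for illustration.

#include <cstdint>
#include <cstdio>

static uint64_t lanemaskLT(uint32_t Lane, uint64_t Active) {
  uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; // bits below Lane
  return Mask & Active;
}
static uint64_t lanemaskGT(uint32_t Lane, uint64_t Active, uint32_t WarpSize) {
  if (Lane == WarpSize - 1)
    return 0;
  uint64_t Mask = ~(uint64_t)0 << (Lane + 1); // bits above Lane (assumed)
  return Mask & Active;
}

int main() {
  uint64_t Active = 0x00000000000000F5ULL; // lanes 0,2,4,5,6,7 active
  std::printf("LT(4) = 0x%llx, GT(4) = 0x%llx\n",
              (unsigned long long)lanemaskLT(4, Active),
              (unsigned long long)lanemaskGT(4, Active, 64));
  return 0;
}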
-uint32_t getThreadIdInWarp() {
+OMP_ATTRS uint32_t getThreadIdInWarp() {
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}
-uint32_t getThreadIdInBlock(int32_t Dim) {
+OMP_ATTRS uint32_t getThreadIdInBlock(int32_t Dim) {
switch (Dim) {
case 0:
return __builtin_amdgcn_workitem_id_x();
@@ -91,12 +91,12 @@ uint32_t getThreadIdInBlock(int32_t Dim) {
UNREACHABLE("Dim outside range!");
}
-uint32_t getNumberOfThreadsInKernel() {
+OMP_ATTRS uint32_t getNumberOfThreadsInKernel() {
return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
__builtin_amdgcn_grid_size_z();
}
-uint32_t getBlockIdInKernel(int32_t Dim) {
+OMP_ATTRS uint32_t getBlockIdInKernel(int32_t Dim) {
switch (Dim) {
case 0:
return __builtin_amdgcn_workgroup_id_x();
@@ -108,7 +108,7 @@ uint32_t getBlockIdInKernel(int32_t Dim) {
UNREACHABLE("Dim outside range!");
}
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
+OMP_ATTRS uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
switch (Dim) {
case 0:
return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -120,11 +120,11 @@ uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
UNREACHABLE("Dim outside range!");
}
-uint32_t getWarpIdInBlock() {
+OMP_ATTRS uint32_t getWarpIdInBlock() {
return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
}
-uint32_t getNumberOfWarpsInBlock() {
+OMP_ATTRS uint32_t getNumberOfWarpsInBlock() {
return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
}
@@ -138,7 +138,7 @@ uint32_t getNumberOfWarpsInBlock() {
device = {arch(nvptx, nvptx64)}, \
implementation = {extension(match_any)})
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
+OMP_ATTRS uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
switch (Dim) {
case 0:
return __nvvm_read_ptx_sreg_ntid_x();
@@ -150,15 +150,15 @@ uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
UNREACHABLE("Dim outside range!");
}
-uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
+OMP_ATTRS uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
-LaneMaskTy activemask() { return __nvvm_activemask(); }
+OMP_ATTRS LaneMaskTy activemask() { return __nvvm_activemask(); }
-LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
+OMP_ATTRS LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
-LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
+OMP_ATTRS LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
-uint32_t getThreadIdInBlock(int32_t Dim) {
+OMP_ATTRS uint32_t getThreadIdInBlock(int32_t Dim) {
switch (Dim) {
case 0:
return __nvvm_read_ptx_sreg_tid_x();
@@ -170,9 +170,9 @@ uint32_t getThreadIdInBlock(int32_t Dim) {
UNREACHABLE("Dim outside range!");
}
-uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
+OMP_ATTRS uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
-uint32_t getBlockIdInKernel(int32_t Dim) {
+OMP_ATTRS uint32_t getBlockIdInKernel(int32_t Dim) {
switch (Dim) {
case 0:
return __nvvm_read_ptx_sreg_ctaid_x();
@@ -184,7 +184,7 @@ uint32_t getBlockIdInKernel(int32_t Dim) {
UNREACHABLE("Dim outside range!");
}
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
+OMP_ATTRS uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
switch (Dim) {
case 0:
return __nvvm_read_ptx_sreg_nctaid_x();
@@ -196,7 +196,7 @@ uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
UNREACHABLE("Dim outside range!");
}
-uint32_t getNumberOfThreadsInKernel() {
+OMP_ATTRS uint32_t getNumberOfThreadsInKernel() {
return impl::getNumberOfThreadsInBlock(0) *
impl::getNumberOfBlocksInKernel(0) *
impl::getNumberOfThreadsInBlock(1) *
@@ -205,11 +205,11 @@ uint32_t getNumberOfThreadsInKernel() {
impl::getNumberOfBlocksInKernel(2);
}
-uint32_t getWarpIdInBlock() {
+OMP_ATTRS uint32_t getWarpIdInBlock() {
return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
}
-uint32_t getNumberOfWarpsInBlock() {
+OMP_ATTRS uint32_t getNumberOfWarpsInBlock() {
return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
mapping::getWarpSize();
}
@@ -224,13 +224,13 @@ uint32_t getNumberOfWarpsInBlock() {
/// below to avoid repeating assumptions or including irrelevant ones.
///{
-static bool isInLastWarp() {
+OMP_ATTRS static bool isInLastWarp() {
uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
~(mapping::getWarpSize() - 1);
return mapping::getThreadIdInBlock() == MainTId;
}
-bool mapping::isMainThreadInGenericMode(bool IsSPMD) {
+OMP_ATTRS bool mapping::isMainThreadInGenericMode(bool IsSPMD) {
if (IsSPMD || icv::Level)
return false;
@@ -238,83 +238,83 @@ bool mapping::isMainThreadInGenericMode(bool IsSPMD) {
return isInLastWarp();
}
-bool mapping::isMainThreadInGenericMode() {
+OMP_ATTRS bool mapping::isMainThreadInGenericMode() {
return mapping::isMainThreadInGenericMode(mapping::isSPMDMode());
}
-bool mapping::isInitialThreadInLevel0(bool IsSPMD) {
+OMP_ATTRS bool mapping::isInitialThreadInLevel0(bool IsSPMD) {
if (IsSPMD)
return mapping::getThreadIdInBlock() == 0;
return isInLastWarp();
}
-bool mapping::isLeaderInWarp() {
+OMP_ATTRS bool mapping::isLeaderInWarp() {
__kmpc_impl_lanemask_t Active = mapping::activemask();
__kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
return utils::popc(Active & LaneMaskLT) == 0;
}
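
The leader test above picks the lowest-numbered active lane: a lane is the warp leader iff no active lane has a smaller id, i.e. popc(Active & LaneMaskLT) == 0. A quick host-side check of that condition (example masks made up):

#include <cstdint>
#include <cstdio>

static bool isLeader(uint32_t Lane, uint64_t Active) {
  uint64_t LaneMaskLT = (((uint64_t)1 << Lane) - 1) & Active;
  return __builtin_popcountll(LaneMaskLT) == 0;
}

int main() {
  uint64_t Active = 0x0000000000000F50ULL; // lanes 4,6,8..11 active
  for (uint32_t Lane : {4u, 6u, 11u})
    std::printf("lane %u leader? %d\n", Lane, isLeader(Lane, Active));
  return 0;
}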
-LaneMaskTy mapping::activemask() { return impl::activemask(); }
+OMP_ATTRS LaneMaskTy mapping::activemask() { return impl::activemask(); }
-LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
+OMP_ATTRS LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
-LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
+OMP_ATTRS LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
-uint32_t mapping::getThreadIdInWarp() {
+OMP_ATTRS uint32_t mapping::getThreadIdInWarp() {
uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
return ThreadIdInWarp;
}
-uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
+OMP_ATTRS uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
return ThreadIdInBlock;
}
-uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
+OMP_ATTRS uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
-uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
+OMP_ATTRS uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
// If we are in SPMD mode, remove one warp.
return BlockSize - (!IsSPMD * impl::getWarpSize());
}
-uint32_t mapping::getMaxTeamThreads() {
+OMP_ATTRS uint32_t mapping::getMaxTeamThreads() {
return mapping::getMaxTeamThreads(mapping::isSPMDMode());
}
-uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
+OMP_ATTRS uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
return impl::getNumberOfThreadsInBlock(Dim);
}
-uint32_t mapping::getNumberOfThreadsInKernel() {
+OMP_ATTRS uint32_t mapping::getNumberOfThreadsInKernel() {
return impl::getNumberOfThreadsInKernel();
}
-uint32_t mapping::getWarpIdInBlock() {
+OMP_ATTRS uint32_t mapping::getWarpIdInBlock() {
uint32_t WarpID = impl::getWarpIdInBlock();
ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
return WarpID;
}
-uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
+OMP_ATTRS uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
uint32_t BlockId = impl::getBlockIdInKernel(Dim);
ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
return BlockId;
}
-uint32_t mapping::getNumberOfWarpsInBlock() {
+OMP_ATTRS uint32_t mapping::getNumberOfWarpsInBlock() {
uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
return NumberOfWarpsInBlocks;
}
-uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
+OMP_ATTRS uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
return NumberOfBlocks;
}
-uint32_t mapping::getNumberOfProcessorElements() {
+OMP_ATTRS uint32_t mapping::getNumberOfProcessorElements() {
return static_cast<uint32_t>(config::getHardwareParallelism());
}
@@ -328,26 +328,27 @@ uint32_t mapping::getNumberOfProcessorElements() {
// the TU. We will need to solve this more correctly in the future.
[[gnu::weak]] int SHARED(IsSPMDMode);
-void mapping::init(bool IsSPMD) {
+OMP_ATTRS void mapping::init(bool IsSPMD) {
if (mapping::isInitialThreadInLevel0(IsSPMD))
IsSPMDMode = IsSPMD;
}
-bool mapping::isSPMDMode() { return IsSPMDMode; }
+OMP_ATTRS bool mapping::isSPMDMode() { return IsSPMDMode; }
-bool mapping::isGenericMode() { return !isSPMDMode(); }
+OMP_ATTRS bool mapping::isGenericMode() { return !isSPMDMode(); }
///}
extern "C" {
-[[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() {
+[[gnu::noinline]] OMP_ATTRS uint32_t __kmpc_get_hardware_thread_id_in_block() {
return mapping::getThreadIdInBlock();
}
-[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
+[[gnu::noinline]] OMP_ATTRS uint32_t
+__kmpc_get_hardware_num_threads_in_block() {
return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
}
-[[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
+[[gnu::noinline]] OMP_ATTRS uint32_t __kmpc_get_warp_size() {
return impl::getWarpSize();
}
}
@@ -361,26 +362,28 @@ _TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
extern "C" {
-uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
+OMP_ATTRS uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
return utils::ballotSync(mask, pred);
}
-int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, int width) {
+OMP_ATTRS int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta,
+ int width) {
return utils::shuffleDown(mask, var, delta, width);
}
-float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta,
- int width) {
+OMP_ATTRS float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta,
+ int width) {
return utils::bitCast<float>(
utils::shuffleDown(mask, utils::bitCast<int32_t>(var), delta, width));
}
-long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) {
+OMP_ATTRS long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta,
+ int width) {
return utils::shuffleDown(mask, var, delta, width);
}
-double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
- int width) {
+OMP_ATTRS double ompx_shfl_down_sync_d(uint64_t mask, double var,
+ unsigned delta, int width) {
return utils::bitCast<double>(
utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width));
}
diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp
index ba6fbf5d5c7e3c..8489e1e011e935 100644
--- a/offload/DeviceRTL/src/Misc.cpp
+++ b/offload/DeviceRTL/src/Misc.cpp
@@ -22,23 +22,23 @@
namespace ompx {
namespace impl {
-double getWTick();
+OMP_ATTRS double getWTick();
-double getWTime();
+OMP_ATTRS double getWTime();
/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
-double getWTick() {
+OMP_ATTRS double getWTick() {
// The number of ticks per second for the AMDGPU clock varies by card and can
// only be retrived by querying the driver. We rely on the device environment
// to inform us what the proper frequency is.
return 1.0 / config::getClockFrequency();
}
-double getWTime() {
+OMP_ATTRS double getWTime() {
return static_cast<double>(__builtin_readsteadycounter()) * getWTick();
}
@@ -51,12 +51,12 @@ double getWTime() {
device = {arch(nvptx, nvptx64)}, \
implementation = {extension(match_any)})
-double getWTick() {
+OMP_ATTRS double getWTick() {
// Timer precision is 1ns
return ((double)1E-9);
}
-double getWTime() {
+OMP_ATTRS double getWTime() {
uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer();
return static_cast<double>(nsecs) * getWTick();
}
@@ -66,7 +66,7 @@ double getWTime() {
/// Lookup a device-side function using a host pointer /p HstPtr using the table
/// provided by the device plugin. The table is an ordered pair of host and
/// device pointers sorted on the value of the host pointer.
-void *indirectCallLookup(void *HstPtr) {
+OMP_ATTRS void *indirectCallLookup(void *HstPtr) {
if (!HstPtr)
return nullptr;
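
The comment above describes the lookup as a search over (host pointer, device pointer) pairs sorted by host pointer. The body of indirectCallLookup is not part of this hunk, so the sketch below is only an assumed illustration of that description, including the fallback of returning the input pointer when no mapping exists.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

using Entry = std::pair<uintptr_t, uintptr_t>; // {HostPtr, DevicePtr}

static uintptr_t lookup(const std::vector<Entry> &Table, uintptr_t HstPtr) {
  if (!HstPtr)
    return 0;
  // Binary search on the host pointer, since the table is sorted on it.
  auto It = std::lower_bound(
      Table.begin(), Table.end(), HstPtr,
      [](const Entry &E, uintptr_t P) { return E.first < P; });
  if (It != Table.end() && It->first == HstPtr)
    return It->second; // mapped device pointer
  return HstPtr;       // assumed fallback: unmapped pointers pass through
}

int main() {
  std::vector<Entry> Table = {
      {0x1000, 0xA000}, {0x2000, 0xB000}, {0x3000, 0xC000}};
  std::printf("0x2000 -> 0x%llx\n", (unsigned long long)lookup(Table, 0x2000));
  std::printf("0x2222 -> 0x%llx\n", (unsigned long long)lookup(Table, 0x2222));
  return 0;
}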
@@ -111,7 +111,8 @@ void *indirectCallLookup(void *HstPtr) {
[[gnu::visibility("protected"), gnu::weak,
gnu::retain]] rpc::Client Client asm("__llvm_rpc_client");
#else
-[[gnu::visibility("protected"), gnu::weak]] rpc::Client Client asm("__llvm_rpc_client");
+[[gnu::visibility("protected"),
+ gnu::weak]] rpc::Client Client asm("__llvm_rpc_client");
#endif
} // namespace impl
@@ -122,19 +123,21 @@ void *indirectCallLookup(void *HstPtr) {
///{
extern "C" {
-int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; }
+OMP_ATTRS int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) {
+ return 0;
+}
-int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; }
+OMP_ATTRS int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; }
-double omp_get_wtick(void) { return ompx::impl::getWTick(); }
+OMP_ATTRS double omp_get_wtick(void) { return ompx::impl::getWTick(); }
-double omp_get_wtime(void) { return ompx::impl::getWTime(); }
+OMP_ATTRS double omp_get_wtime(void) { return ompx::impl::getWTime(); }
-void *__llvm_omp_indirect_call_lookup(void *HstPtr) {
+OMP_ATTRS void *__llvm_omp_indirect_call_lookup(void *HstPtr) {
return ompx::impl::indirectCallLookup(HstPtr);
}
-void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
+OMP_ATTRS void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
switch (allocator) {
case omp_default_mem_alloc:
case omp_large_cap_mem_alloc:
@@ -147,7 +150,7 @@ void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
}
}
-void omp_free(void *ptr, omp_allocator_handle_t allocator) {
+OMP_ATTRS void omp_free(void *ptr, omp_allocator_handle_t allocator) {
switch (allocator) {
case omp_default_mem_alloc:
case omp_large_cap_mem_alloc:
@@ -161,7 +164,8 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
}
}
-unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) {
+OMP_ATTRS unsigned long long __llvm_omp_host_call(void *fn, void *data,
+ size_t size) {
rpc::Client::Port Port = ompx::impl::Client.open<OFFLOAD_HOST_CALL>();
Port.send_n(data, size);
Port.send([=](rpc::Buffer *buffer, uint32_t) {
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
index 5286d53b623f0a..8473da4c5fa54d 100644
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ b/offload/DeviceRTL/src/Parallelism.cpp
@@ -46,7 +46,7 @@ using namespace ompx;
namespace {
-uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
+OMP_ATTRS uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
uint32_t NThreadsICV =
NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
uint32_t NumThreads = mapping::getMaxTeamThreads();
@@ -68,9 +68,10 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
}
// Invoke an outlined parallel function unwrapping arguments (up to 32).
-[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
- int32_t bound_tid, void *fn,
- void **args, int64_t nargs) {
+[[clang::always_inline]] OMP_ATTRS void invokeMicrotask(int32_t global_tid,
+ int32_t bound_tid,
+ void *fn, void **args,
+ int64_t nargs) {
switch (nargs) {
#include "generated_microtask_cases.gen"
default:
@@ -83,10 +84,9 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
extern "C" {
-[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
- int32_t num_threads,
- void *fn, void **args,
- const int64_t nargs) {
+[[clang::always_inline]] OMP_ATTRS void
+__kmpc_parallel_spmd(IdentTy *ident, int32_t num_threads, void *fn, void **args,
+ const int64_t nargs) {
uint32_t TId = mapping::getThreadIdInBlock();
uint32_t NumThreads = determineNumberOfThreads(num_threads);
uint32_t PTeamSize =
@@ -141,7 +141,7 @@ extern "C" {
return;
}
-[[clang::always_inline]] void
+[[clang::always_inline]] OMP_ATTRS void
__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
int32_t num_threads, int proc_bind, void *fn,
void *wrapper_fn, void **args, int64_t nargs) {
@@ -278,7 +278,8 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
__kmpc_end_sharing_variables();
}
-[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
+[[clang::noinline]] OMP_ATTRS bool
+__kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
// Work function and arguments for L1 parallel region.
*WorkFn = state::ParallelRegionFn;
@@ -292,7 +293,7 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
return ThreadIsActive;
}
-[[clang::noinline]] void __kmpc_kernel_end_parallel() {
+[[clang::noinline]] OMP_ATTRS void __kmpc_kernel_end_parallel() {
// In case we have modified an ICV for this thread before a ThreadState was
// created. We drop it now to not contaminate the next parallel region.
ASSERT(!mapping::isSPMDMode(), nullptr);
@@ -301,14 +302,19 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
ASSERT(!mapping::isSPMDMode(), nullptr);
}
-uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }
+OMP_ATTRS uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) {
+ return omp_get_level();
+}
-int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }
+OMP_ATTRS int32_t __kmpc_global_thread_num(IdentTy *) {
+ return omp_get_thread_num();
+}
-void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
- int32_t thread_limit) {}
+OMP_ATTRS void __kmpc_push_num_teams(IdentTy *loc, int32_t tid,
+ int32_t num_teams, int32_t thread_limit) {}
-void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
+OMP_ATTRS void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid,
+ int proc_bind) {}
}
#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp
index bb3caaadcc03dd..0a0be5a2f028da 100644
--- a/offload/DeviceRTL/src/Profiling.cpp
+++ b/offload/DeviceRTL/src/Profiling.cpp
@@ -14,9 +14,9 @@ extern "C" {
// Provides empty implementations for certain functions in compiler-rt
// that are emitted by the PGO instrumentation.
-void __llvm_profile_register_function(void *Ptr) {}
-void __llvm_profile_register_names_function(void *Ptr, long int I) {}
-void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {}
+OMP_ATTRS void __llvm_profile_register_function(void *Ptr) {}
+OMP_ATTRS void __llvm_profile_register_names_function(void *Ptr, long int I) {}
+OMP_ATTRS void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {}
}
#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index d3b4528401953c..5ecefddfaa51fb 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -24,15 +24,17 @@ namespace {
#pragma omp begin declare target device_type(nohost)
-void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
+OMP_ATTRS void gpu_regular_warp_reduce(void *reduce_data,
+ ShuffleReductFnTy shflFct) {
for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
shflFct(reduce_data, /*LaneId - not used= */ 0,
/*Offset = */ mask, /*AlgoVersion=*/0);
}
}
-void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
- uint32_t size, uint32_t tid) {
+OMP_ATTRS void gpu_irregular_warp_reduce(void *reduce_data,
+ ShuffleReductFnTy shflFct,
+ uint32_t size, uint32_t tid) {
uint32_t curr_size;
uint32_t mask;
curr_size = size;
@@ -44,8 +46,8 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
}
}
-static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
- ShuffleReductFnTy shflFct) {
+OMP_ATTRS static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
+ ShuffleReductFnTy shflFct) {
uint32_t size, remote_id, physical_lane_id;
physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
__kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
@@ -63,9 +65,9 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
return (logical_lane_id == 0);
}
-static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
- ShuffleReductFnTy shflFct,
- InterWarpCopyFnTy cpyFct) {
+OMP_ATTRS static int32_t
+nvptx_parallel_reduce_nowait(void *reduce_data, ShuffleReductFnTy shflFct,
+ InterWarpCopyFnTy cpyFct) {
uint32_t BlockThreadId = mapping::getThreadIdInBlock();
if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false))
BlockThreadId = 0;
@@ -73,16 +75,16 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
if (NumThreads == 1)
return 1;
- //
- // This reduce function handles reduction within a team. It handles
- // parallel regions in both L1 and L2 parallelism levels. It also
- // supports Generic, SPMD, and NoOMP modes.
- //
- // 1. Reduce within a warp.
- // 2. Warp master copies value to warp 0 via shared memory.
- // 3. Warp 0 reduces to a single value.
- // 4. The reduced value is available in the thread that returns 1.
- //
+ //
+ // This reduce function handles reduction within a team. It handles
+ // parallel regions in both L1 and L2 parallelism levels. It also
+ // supports Generic, SPMD, and NoOMP modes.
+ //
+ // 1. Reduce within a warp.
+ // 2. Warp master copies value to warp 0 via shared memory.
+ // 3. Warp 0 reduces to a single value.
+ // 4. The reduced value is available in the thread that returns 1.
+ //
#if __has_builtin(__nvvm_reflect)
if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
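
The comment above lists the team-reduction steps: warp-level shuffle reduce, warp masters copy partials to shared memory, then warp 0 reduces the partials. A host-side model of those steps (warp size, thread count, and the sum operator are illustrative choices, not taken from the patch):

#include <cstdio>
#include <numeric>
#include <vector>

static constexpr int WarpSize = 32;

// Reduce one warp's register values with shuffle-down halving; lane 0 ends up
// holding the warp's sum (out-of-range lanes contribute 0 in this model).
static int warpReduce(std::vector<int> Lanes) {
  for (int Mask = WarpSize / 2; Mask > 0; Mask /= 2)
    for (int Lane = 0; Lane < WarpSize; ++Lane)
      Lanes[Lane] += (Lane + Mask < WarpSize) ? Lanes[Lane + Mask] : 0;
  return Lanes[0];
}

int main() {
  const int NumThreads = 128;
  std::vector<int> Values(NumThreads);
  std::iota(Values.begin(), Values.end(), 1); // thread i holds i + 1
  std::vector<int> WarpPartials;              // step 2: one "shared" slot per warp
  for (int W = 0; W < NumThreads / WarpSize; ++W) {
    std::vector<int> Warp(Values.begin() + W * WarpSize,
                          Values.begin() + (W + 1) * WarpSize);
    WarpPartials.push_back(warpReduce(Warp)); // step 1 per warp
  }
  WarpPartials.resize(WarpSize, 0);           // pad so "warp 0" can reduce it
  int Total = warpReduce(WarpPartials);       // step 3
  std::printf("sum 1..128 = %d (expected %d)\n", Total, 128 * 129 / 2);
  return 0;
}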
@@ -157,26 +159,24 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
return BlockThreadId == 0;
}
-uint32_t roundToWarpsize(uint32_t s) {
+OMP_ATTRS uint32_t roundToWarpsize(uint32_t s) {
if (s < mapping::getWarpSize())
return 1;
return (s & ~(unsigned)(mapping::getWarpSize() - 1));
}
-uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
+OMP_ATTRS uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
} // namespace
extern "C" {
-int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
- uint64_t reduce_data_size,
- void *reduce_data,
- ShuffleReductFnTy shflFct,
- InterWarpCopyFnTy cpyFct) {
+OMP_ATTRS int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
+ IdentTy *Loc, uint64_t reduce_data_size, void *reduce_data,
+ ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
}
-int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
+OMP_ATTRS int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
@@ -313,7 +313,7 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
}
}
-void *__kmpc_reduction_get_fixed_buffer() {
+OMP_ATTRS void *__kmpc_reduction_get_fixed_buffer() {
return state::getKernelLaunchEnvironment().ReductionBuffer;
}
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 855c74fa58e0a5..baa8d7364d75ec 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -53,13 +53,15 @@ namespace {
extern "C" {
#ifdef __AMDGPU__
-[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
-[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
+[[gnu::weak]] OMP_ATTRS void *malloc(size_t Size) {
+ return allocator::alloc(Size);
+}
+[[gnu::weak]] OMP_ATTRS void free(void *Ptr) { allocator::free(Ptr); }
#else
-[[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
-[[gnu::weak, gnu::leaf]] void free(void *Ptr);
+[[gnu::weak, gnu::leaf]] OMP_ATTRS void *malloc(size_t Size);
+[[gnu::weak, gnu::leaf]] OMP_ATTRS void free(void *Ptr);
#endif
}
@@ -76,19 +78,19 @@ extern "C" {
///
struct SharedMemorySmartStackTy {
/// Initialize the stack. Must be called by all threads.
- void init(bool IsSPMD);
+ OMP_ATTRS void init(bool IsSPMD);
/// Allocate \p Bytes on the stack for the encountering thread. Each thread
/// can call this function.
- void *push(uint64_t Bytes);
+ OMP_ATTRS void *push(uint64_t Bytes);
/// Deallocate the last allocation made by the encountering thread and pointed
/// to by \p Ptr from the stack. Each thread can call this function.
- void pop(void *Ptr, uint64_t Bytes);
+ OMP_ATTRS void pop(void *Ptr, uint64_t Bytes);
private:
/// Compute the size of the storage space reserved for a thread.
- uint32_t computeThreadStorageTotal() {
+ OMP_ATTRS uint32_t computeThreadStorageTotal() {
uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
return utils::alignDown((state::SharedScratchpadSize / NumLanesInBlock),
allocator::ALIGNMENT);
@@ -96,7 +98,7 @@ struct SharedMemorySmartStackTy {
/// Return the top address of the warp data stack, that is the first address
/// this warp will allocate memory at next.
- void *getThreadDataTop(uint32_t TId) {
+ OMP_ATTRS void *getThreadDataTop(uint32_t TId) {
return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
}
@@ -113,11 +115,11 @@ static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
-void SharedMemorySmartStackTy::init(bool IsSPMD) {
+OMP_ATTRS void SharedMemorySmartStackTy::init(bool IsSPMD) {
Usage[mapping::getThreadIdInBlock()] = 0;
}
-void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
+OMP_ATTRS void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
// First align the number of requested bytes.
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
/// be passed in as an argument and the stack rewritten to support it.
@@ -148,7 +150,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
return GlobalMemory;
}
-void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
+OMP_ATTRS void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT);
if (utils::isSharedMemPtr(Ptr)) {
int TId = mapping::getThreadIdInBlock();
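
A single-thread host model of the push/pop discipline above: each thread owns a fixed slice of the shared scratchpad used as a bump stack with worst-case alignment padding, and requests that do not fit fall back to global memory. The slice size and the malloc/free fallback are illustrative; only the bump/pop arithmetic mirrors the code in this hunk.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

static constexpr uint32_t Alignment = 16;
static constexpr uint32_t SliceSize = 256; // per-thread scratchpad slice

static unsigned char Slice[SliceSize];
static uint32_t Usage = 0;

static uint64_t alignUp(uint64_t Bytes) {
  return (Bytes + Alignment - 1) & ~(uint64_t)(Alignment - 1);
}
static bool isSlicePtr(const void *Ptr) {
  auto *P = static_cast<const unsigned char *>(Ptr);
  return P >= Slice && P < Slice + SliceSize;
}

static void *push(uint64_t Bytes) {
  uint64_t Aligned = alignUp(Bytes); // worst-case padding, as in push() above
  if (Usage + Aligned <= SliceSize) {
    void *Ptr = &Slice[Usage];
    Usage += Aligned;
    return Ptr;
  }
  return std::malloc(Aligned);       // global-memory fallback
}

static void pop(void *Ptr, uint64_t Bytes) {
  if (isSlicePtr(Ptr)) {
    Usage -= alignUp(Bytes);         // LIFO bump-stack discipline
    return;
  }
  std::free(Ptr);
}

int main() {
  void *A = push(40);
  void *B = push(300); // does not fit in the slice: falls back to malloc
  std::printf("A in slice: %d, B in slice: %d, usage = %u\n",
              isSlicePtr(A), isSlicePtr(B), Usage);
  pop(B, 300);
  pop(A, 40);
  std::printf("usage after pops = %u\n", Usage);
  return 0;
}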
@@ -160,28 +162,29 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
} // namespace
-void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
+OMP_ATTRS void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
-void *memory::allocShared(uint64_t Bytes, const char *Reason) {
+OMP_ATTRS void *memory::allocShared(uint64_t Bytes, const char *Reason) {
return SharedMemorySmartStack.push(Bytes);
}
-void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
+OMP_ATTRS void memory::freeShared(void *Ptr, uint64_t Bytes,
+ const char *Reason) {
SharedMemorySmartStack.pop(Ptr, Bytes);
}
-void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
+OMP_ATTRS void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
void *Ptr = malloc(Bytes);
if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
PRINT("nullptr returned by malloc!\n");
return Ptr;
}
-void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
+OMP_ATTRS void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
///}
-bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
+OMP_ATTRS bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
(ActiveLevelVar == Other.ActiveLevelVar) &
(MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
@@ -189,7 +192,7 @@ bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
(RunSchedChunkVar == Other.RunSchedChunkVar);
}
-void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
+OMP_ATTRS void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
ASSERT(LevelVar == Other.LevelVar, nullptr);
ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
@@ -198,7 +201,7 @@ void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
}
-void state::TeamStateTy::init(bool IsSPMD) {
+OMP_ATTRS void state::TeamStateTy::init(bool IsSPMD) {
ICVState.NThreadsVar = 0;
ICVState.LevelVar = 0;
ICVState.ActiveLevelVar = 0;
@@ -211,13 +214,13 @@ void state::TeamStateTy::init(bool IsSPMD) {
ParallelRegionFnVar = nullptr;
}
-bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
+OMP_ATTRS bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
return (ICVState == Other.ICVState) &
(HasThreadState == Other.HasThreadState) &
(ParallelTeamSize == Other.ParallelTeamSize);
}
-void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
+OMP_ATTRS void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
ICVState.assertEqual(Other.ICVState);
ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
ASSERT(HasThreadState == Other.HasThreadState, nullptr);
@@ -228,8 +231,8 @@ state::ThreadStateTy **SHARED(ompx::state::ThreadStates);
namespace {
-int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
- int OutOfBoundsVal = -1) {
+OMP_ATTRS int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
+ int OutOfBoundsVal = -1) {
if (Level == 0)
return DefaultVal;
int LevelVar = omp_get_level();
@@ -243,8 +246,8 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
} // namespace
-void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+OMP_ATTRS void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
+ KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
SharedMemorySmartStack.init(IsSPMD);
if (mapping::isInitialThreadInLevel0(IsSPMD)) {
TeamState.init(IsSPMD);
@@ -254,15 +257,15 @@ void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
}
}
-KernelEnvironmentTy &state::getKernelEnvironment() {
+OMP_ATTRS KernelEnvironmentTy &state::getKernelEnvironment() {
return *KernelEnvironmentPtr;
}
-KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
+OMP_ATTRS KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
return *KernelLaunchEnvironmentPtr;
}
-void state::enterDataEnvironment(IdentTy *Ident) {
+OMP_ATTRS void state::enterDataEnvironment(IdentTy *Ident) {
ASSERT(config::mayUseThreadStates(),
"Thread state modified while explicitly disabled!");
if (!config::mayUseThreadStates())
@@ -291,7 +294,7 @@ void state::enterDataEnvironment(IdentTy *Ident) {
ThreadStates[TId] = NewThreadState;
}
-void state::exitDataEnvironment() {
+OMP_ATTRS void state::exitDataEnvironment() {
ASSERT(config::mayUseThreadStates(),
"Thread state modified while explicitly disabled!");
@@ -299,7 +302,7 @@ void state::exitDataEnvironment() {
resetStateForThread(TId);
}
-void state::resetStateForThread(uint32_t TId) {
+OMP_ATTRS void state::resetStateForThread(uint32_t TId) {
if (!config::mayUseThreadStates())
return;
if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
@@ -310,7 +313,7 @@ void state::resetStateForThread(uint32_t TId) {
ThreadStates[TId] = PreviousThreadState;
}
-void state::runAndCheckState(void(Func(void))) {
+OMP_ATTRS void state::runAndCheckState(void(Func(void))) {
TeamStateTy OldTeamState = TeamState;
OldTeamState.assertEqual(TeamState);
@@ -319,133 +322,146 @@ void state::runAndCheckState(void(Func(void))) {
OldTeamState.assertEqual(TeamState);
}
-void state::assumeInitialState(bool IsSPMD) {
+OMP_ATTRS void state::assumeInitialState(bool IsSPMD) {
TeamStateTy InitialTeamState;
InitialTeamState.init(IsSPMD);
InitialTeamState.assertEqual(TeamState);
ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
}
-int state::getEffectivePTeamSize() {
+OMP_ATTRS int state::getEffectivePTeamSize() {
int PTeamSize = state::ParallelTeamSize;
return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
}
extern "C" {
-void omp_set_dynamic(int V) {}
+OMP_ATTRS void omp_set_dynamic(int V) {}
-int omp_get_dynamic(void) { return 0; }
+OMP_ATTRS int omp_get_dynamic(void) { return 0; }
-void omp_set_num_threads(int V) { icv::NThreads = V; }
+OMP_ATTRS void omp_set_num_threads(int V) { icv::NThreads = V; }
-int omp_get_max_threads(void) {
+OMP_ATTRS int omp_get_max_threads(void) {
int NT = icv::NThreads;
return NT > 0 ? NT : mapping::getMaxTeamThreads();
}
-int omp_get_level(void) {
+OMP_ATTRS int omp_get_level(void) {
int LevelVar = icv::Level;
ASSERT(LevelVar >= 0, nullptr);
return LevelVar;
}
-int omp_get_active_level(void) { return !!icv::ActiveLevel; }
+OMP_ATTRS int omp_get_active_level(void) { return !!icv::ActiveLevel; }
-int omp_in_parallel(void) { return !!icv::ActiveLevel; }
+OMP_ATTRS int omp_in_parallel(void) { return !!icv::ActiveLevel; }
-void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
+OMP_ATTRS void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
*ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
*ChunkSize = state::RunSchedChunk;
}
-void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
+OMP_ATTRS void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
icv::RunSched = (int)ScheduleKind;
state::RunSchedChunk = ChunkSize;
}
-int omp_get_ancestor_thread_num(int Level) {
+OMP_ATTRS int omp_get_ancestor_thread_num(int Level) {
return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}
-int omp_get_thread_num(void) {
+OMP_ATTRS int omp_get_thread_num(void) {
return omp_get_ancestor_thread_num(omp_get_level());
}
-int omp_get_team_size(int Level) {
+OMP_ATTRS int omp_get_team_size(int Level) {
return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
}
-int omp_get_num_threads(void) {
+OMP_ATTRS int omp_get_num_threads(void) {
return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
}
-int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
+OMP_ATTRS int omp_get_thread_limit(void) {
+ return mapping::getMaxTeamThreads();
+}
-int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }
+OMP_ATTRS int omp_get_num_procs(void) {
+ return mapping::getNumberOfProcessorElements();
+}
-void omp_set_nested(int) {}
+OMP_ATTRS void omp_set_nested(int) {}
-int omp_get_nested(void) { return false; }
+OMP_ATTRS int omp_get_nested(void) { return false; }
-void omp_set_max_active_levels(int Levels) {
+OMP_ATTRS void omp_set_max_active_levels(int Levels) {
icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}
-int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }
+OMP_ATTRS int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }
-omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }
+OMP_ATTRS omp_proc_bind_t omp_get_proc_bind(void) {
+ return omp_proc_bind_false;
+}
-int omp_get_num_places(void) { return 0; }
+OMP_ATTRS int omp_get_num_places(void) { return 0; }
-int omp_get_place_num_procs(int) { return omp_get_num_procs(); }
+OMP_ATTRS int omp_get_place_num_procs(int) { return omp_get_num_procs(); }
-void omp_get_place_proc_ids(int, int *) {
+OMP_ATTRS void omp_get_place_proc_ids(int, int *) {
// TODO
}
-int omp_get_place_num(void) { return 0; }
+OMP_ATTRS int omp_get_place_num(void) { return 0; }
-int omp_get_partition_num_places(void) { return 0; }
+OMP_ATTRS int omp_get_partition_num_places(void) { return 0; }
-void omp_get_partition_place_nums(int *) {
+OMP_ATTRS void omp_get_partition_place_nums(int *) {
// TODO
}
-int omp_get_cancellation(void) { return 0; }
+OMP_ATTRS int omp_get_cancellation(void) { return 0; }
-void omp_set_default_device(int) {}
+OMP_ATTRS void omp_set_default_device(int) {}
-int omp_get_default_device(void) { return -1; }
+OMP_ATTRS int omp_get_default_device(void) { return -1; }
-int omp_get_num_devices(void) { return config::getNumDevices(); }
+OMP_ATTRS int omp_get_num_devices(void) { return config::getNumDevices(); }
-int omp_get_device_num(void) { return config::getDeviceNum(); }
+OMP_ATTRS int omp_get_device_num(void) { return config::getDeviceNum(); }
-int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }
+OMP_ATTRS int omp_get_num_teams(void) {
+ return mapping::getNumberOfBlocksInKernel();
+}
-int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
+OMP_ATTRS int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
-int omp_get_initial_device(void) { return -1; }
+OMP_ATTRS int omp_get_initial_device(void) { return -1; }
-int omp_is_initial_device(void) { return 0; }
+OMP_ATTRS int omp_is_initial_device(void) { return 0; }
}
extern "C" {
-[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
+[[clang::noinline]] OMP_ATTRS void *__kmpc_alloc_shared(uint64_t Bytes) {
return memory::allocShared(Bytes, "Frontend alloc shared");
}
-[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
+[[clang::noinline]] OMP_ATTRS void __kmpc_free_shared(void *Ptr,
+ uint64_t Bytes) {
memory::freeShared(Ptr, Bytes, "Frontend free shared");
}
-void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }
+OMP_ATTRS void *__kmpc_get_dynamic_shared() {
+ return memory::getDynamicBuffer();
+}
-void *llvm_omp_target_dynamic_shared_alloc() {
+OMP_ATTRS void *llvm_omp_target_dynamic_shared_alloc() {
return __kmpc_get_dynamic_shared();
}
-void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
+OMP_ATTRS void *llvm_omp_get_dynamic_shared() {
+ return __kmpc_get_dynamic_shared();
+}
/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
@@ -460,7 +476,8 @@ constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
#pragma omp allocate(SharedMemVariableSharingSpacePtr) \
allocator(omp_pteam_mem_alloc)
-void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
+OMP_ATTRS void __kmpc_begin_sharing_variables(void ***GlobalArgs,
+ uint64_t nArgs) {
if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
} else {
@@ -472,12 +489,12 @@ void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
*GlobalArgs = SharedMemVariableSharingSpacePtr;
}
-void __kmpc_end_sharing_variables() {
+OMP_ATTRS void __kmpc_end_sharing_variables() {
if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}
-void __kmpc_get_shared_variables(void ***GlobalArgs) {
+OMP_ATTRS void __kmpc_get_shared_variables(void ***GlobalArgs) {
*GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
index 72a97ae3fcfb42..f054a8add13138 100644
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ b/offload/DeviceRTL/src/Synchronization.cpp
@@ -29,36 +29,41 @@ namespace impl {
///
///{
/// NOTE: This function needs to be implemented by every target.
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
- atomic::MemScopeTy MemScope);
+OMP_ATTRS uint32_t atomicInc(uint32_t *Address, uint32_t Val,
+ atomic::OrderingTy Ordering,
+ atomic::MemScopeTy MemScope);
///}
// Forward declarations defined to be defined for AMDGCN and NVPTX.
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
- atomic::MemScopeTy MemScope);
-void namedBarrierInit();
-void namedBarrier();
-void fenceTeam(atomic::OrderingTy Ordering);
-void fenceKernel(atomic::OrderingTy Ordering);
-void fenceSystem(atomic::OrderingTy Ordering);
-void syncWarp(__kmpc_impl_lanemask_t);
-void syncThreads(atomic::OrderingTy Ordering);
-void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
-void unsetLock(omp_lock_t *);
-int testLock(omp_lock_t *);
-void initLock(omp_lock_t *);
-void destroyLock(omp_lock_t *);
-void setLock(omp_lock_t *);
-void unsetCriticalLock(omp_lock_t *);
-void setCriticalLock(omp_lock_t *);
+OMP_ATTRS uint32_t atomicInc(uint32_t *A, uint32_t V,
+ atomic::OrderingTy Ordering,
+ atomic::MemScopeTy MemScope);
+OMP_ATTRS void namedBarrierInit();
+OMP_ATTRS void namedBarrier();
+OMP_ATTRS void fenceTeam(atomic::OrderingTy Ordering);
+OMP_ATTRS void fenceKernel(atomic::OrderingTy Ordering);
+OMP_ATTRS void fenceSystem(atomic::OrderingTy Ordering);
+OMP_ATTRS void syncWarp(__kmpc_impl_lanemask_t);
+OMP_ATTRS void syncThreads(atomic::OrderingTy Ordering);
+OMP_ATTRS void syncThreadsAligned(atomic::OrderingTy Ordering) {
+ syncThreads(Ordering);
+}
+OMP_ATTRS void unsetLock(omp_lock_t *);
+OMP_ATTRS int testLock(omp_lock_t *);
+OMP_ATTRS void initLock(omp_lock_t *);
+OMP_ATTRS void destroyLock(omp_lock_t *);
+OMP_ATTRS void setLock(omp_lock_t *);
+OMP_ATTRS void unsetCriticalLock(omp_lock_t *);
+OMP_ATTRS void setCriticalLock(omp_lock_t *);
/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
- atomic::MemScopeTy MemScope) {
+OMP_ATTRS uint32_t atomicInc(uint32_t *A, uint32_t V,
+ atomic::OrderingTy Ordering,
+ atomic::MemScopeTy MemScope) {
// builtin_amdgcn_atomic_inc32 should expand to this switch when
// passed a runtime value, but does not do so yet. Workaround here.
@@ -91,12 +96,12 @@ uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
uint32_t SHARED(namedBarrierTracker);
-void namedBarrierInit() {
+OMP_ATTRS void namedBarrierInit() {
// Don't have global ctors, and shared memory is not zero init
atomic::store(&namedBarrierTracker, 0u, atomic::release);
}
-void namedBarrier() {
+OMP_ATTRS void namedBarrier() {
uint32_t NumThreads = omp_get_num_threads();
// assert(NumThreads % 32 == 0);
@@ -143,25 +148,25 @@ void namedBarrier() {
fence::team(atomic::release);
}
-void fenceTeam(atomic::OrderingTy Ordering) {
+OMP_ATTRS void fenceTeam(atomic::OrderingTy Ordering) {
return __scoped_atomic_thread_fence(Ordering, atomic::workgroup);
}
-void fenceKernel(atomic::OrderingTy Ordering) {
+OMP_ATTRS void fenceKernel(atomic::OrderingTy Ordering) {
return __scoped_atomic_thread_fence(Ordering, atomic::device_);
}
-void fenceSystem(atomic::OrderingTy Ordering) {
+OMP_ATTRS void fenceSystem(atomic::OrderingTy Ordering) {
return __scoped_atomic_thread_fence(Ordering, atomic::system);
}
-void syncWarp(__kmpc_impl_lanemask_t) {
+OMP_ATTRS void syncWarp(__kmpc_impl_lanemask_t) {
// This is a no-op on current AMDGPU hardware but it is used by the optimizer
// to enforce convergent behaviour between control flow graphs.
__builtin_amdgcn_wave_barrier();
}
-void syncThreads(atomic::OrderingTy Ordering) {
+OMP_ATTRS void syncThreads(atomic::OrderingTy Ordering) {
if (Ordering != atomic::relaxed)
fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);
@@ -170,23 +175,25 @@ void syncThreads(atomic::OrderingTy Ordering) {
if (Ordering != atomic::relaxed)
fenceTeam(Ordering == atomic::acq_rel ? atomic::aquire : atomic::seq_cst);
}
-void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
+OMP_ATTRS void syncThreadsAligned(atomic::OrderingTy Ordering) {
+ syncThreads(Ordering);
+}
// TODO: Don't have wavefront lane locks. Possibly can't have them.
-void unsetLock(omp_lock_t *) { __builtin_trap(); }
-int testLock(omp_lock_t *) { __builtin_trap(); }
-void initLock(omp_lock_t *) { __builtin_trap(); }
-void destroyLock(omp_lock_t *) { __builtin_trap(); }
-void setLock(omp_lock_t *) { __builtin_trap(); }
+OMP_ATTRS void unsetLock(omp_lock_t *) { __builtin_trap(); }
+OMP_ATTRS int testLock(omp_lock_t *) { __builtin_trap(); }
+OMP_ATTRS void initLock(omp_lock_t *) { __builtin_trap(); }
+OMP_ATTRS void destroyLock(omp_lock_t *) { __builtin_trap(); }
+OMP_ATTRS void setLock(omp_lock_t *) { __builtin_trap(); }
constexpr uint32_t UNSET = 0;
constexpr uint32_t SET = 1;
-void unsetCriticalLock(omp_lock_t *Lock) {
+OMP_ATTRS void unsetCriticalLock(omp_lock_t *Lock) {
(void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel);
}
-void setCriticalLock(omp_lock_t *Lock) {
+OMP_ATTRS void setCriticalLock(omp_lock_t *Lock) {
uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1;
if (mapping::getThreadIdInWarp() == LowestActiveThread) {
fenceKernel(atomic::release);
@@ -208,14 +215,15 @@ void setCriticalLock(omp_lock_t *Lock) {
device = {arch(nvptx, nvptx64)}, \
implementation = {extension(match_any)})
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
- atomic::MemScopeTy MemScope) {
+OMP_ATTRS uint32_t atomicInc(uint32_t *Address, uint32_t Val,
+ atomic::OrderingTy Ordering,
+ atomic::MemScopeTy MemScope) {
return __nvvm_atom_inc_gen_ui(Address, Val);
}
-void namedBarrierInit() {}
+OMP_ATTRS void namedBarrierInit() {}
-void namedBarrier() {
+OMP_ATTRS void namedBarrier() {
uint32_t NumThreads = omp_get_num_threads();
ASSERT(NumThreads % 32 == 0, nullptr);
@@ -225,20 +233,24 @@ void namedBarrier() {
__nvvm_barrier_sync_cnt(BarrierNo, NumThreads);
}
-void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); }
+OMP_ATTRS void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); }
-void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); }
+OMP_ATTRS void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); }
-void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); }
+OMP_ATTRS void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); }
-void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
+OMP_ATTRS void syncWarp(__kmpc_impl_lanemask_t Mask) {
+ __nvvm_bar_warp_sync(Mask);
+}
-void syncThreads(atomic::OrderingTy Ordering) {
+OMP_ATTRS void syncThreads(atomic::OrderingTy Ordering) {
constexpr int BarrierNo = 8;
__nvvm_barrier_sync(BarrierNo);
}
-void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }
+OMP_ATTRS void syncThreadsAligned(atomic::OrderingTy Ordering) {
+ __syncthreads();
+}
constexpr uint32_t OMP_SPIN = 1000;
constexpr uint32_t UNSET = 0;
@@ -247,19 +259,19 @@ constexpr uint32_t SET = 1;
// TODO: This seems to hide a bug in the declare variant handling. If it is
// called before it is defined
// here the overload won't happen. Investigate lalter!
-void unsetLock(omp_lock_t *Lock) {
+OMP_ATTRS void unsetLock(omp_lock_t *Lock) {
(void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst);
}
-int testLock(omp_lock_t *Lock) {
+OMP_ATTRS int testLock(omp_lock_t *Lock) {
return atomic::add((uint32_t *)Lock, 0u, atomic::seq_cst);
}
-void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
+OMP_ATTRS void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
-void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
+OMP_ATTRS void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
-void setLock(omp_lock_t *Lock) {
+OMP_ATTRS void setLock(omp_lock_t *Lock) {
// TODO: not sure spinning is a good idea here..
while (atomic::cas((uint32_t *)Lock, UNSET, SET, atomic::seq_cst,
atomic::seq_cst) != UNSET) {
@@ -275,56 +287,63 @@ void setLock(omp_lock_t *Lock) {
} // wait for 0 to be the read value
}
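
A host-side analogue of the setLock/unsetLock pattern above: spin on a compare-and-swap from UNSET to SET and release by storing UNSET. std::atomic with seq_cst stands in for the device atomics; the device version additionally sleeps between attempts (OMP_SPIN backoff), which is omitted here.

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

static constexpr uint32_t UNSET = 0;
static constexpr uint32_t SET = 1;

static std::atomic<uint32_t> Lock{UNSET};
static int Counter = 0; // protected by Lock

static void setLock() {
  uint32_t Expected = UNSET;
  while (!Lock.compare_exchange_strong(Expected, SET,
                                       std::memory_order_seq_cst))
    Expected = UNSET; // CAS wrote back the observed value; reset and retry
}
static void unsetLock() { Lock.store(UNSET, std::memory_order_seq_cst); }

int main() {
  std::vector<std::thread> Threads;
  for (int I = 0; I < 4; ++I)
    Threads.emplace_back([] {
      for (int J = 0; J < 10000; ++J) {
        setLock();
        ++Counter;
        unsetLock();
      }
    });
  for (auto &T : Threads)
    T.join();
  std::printf("counter = %d (expected 40000)\n", Counter);
  return 0;
}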
-void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); }
+OMP_ATTRS void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); }
-void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
+OMP_ATTRS void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
#pragma omp end declare variant
///}
} // namespace impl
-void synchronize::init(bool IsSPMD) {
+OMP_ATTRS void synchronize::init(bool IsSPMD) {
if (!IsSPMD)
impl::namedBarrierInit();
}
-void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
+OMP_ATTRS void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
-void synchronize::threads(atomic::OrderingTy Ordering) {
+OMP_ATTRS void synchronize::threads(atomic::OrderingTy Ordering) {
impl::syncThreads(Ordering);
}
-void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
+OMP_ATTRS void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
impl::syncThreadsAligned(Ordering);
}
-void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }
+OMP_ATTRS void fence::team(atomic::OrderingTy Ordering) {
+ impl::fenceTeam(Ordering);
+}
-void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }
+OMP_ATTRS void fence::kernel(atomic::OrderingTy Ordering) {
+ impl::fenceKernel(Ordering);
+}
-void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); }
+OMP_ATTRS void fence::system(atomic::OrderingTy Ordering) {
+ impl::fenceSystem(Ordering);
+}
-uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering,
- atomic::MemScopeTy MemScope) {
+OMP_ATTRS uint32_t atomic::inc(uint32_t *Addr, uint32_t V,
+ atomic::OrderingTy Ordering,
+ atomic::MemScopeTy MemScope) {
return impl::atomicInc(Addr, V, Ordering, MemScope);
}
-void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
+OMP_ATTRS void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
-void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }
+OMP_ATTRS void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }
extern "C" {
-void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}
+OMP_ATTRS void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}
-void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {}
+OMP_ATTRS void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {}
-int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) {
+OMP_ATTRS int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) {
__kmpc_barrier(Loc, TId);
return 0;
}
-void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
+OMP_ATTRS void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
if (mapping::isMainThreadInGenericMode())
return __kmpc_flush(Loc);
@@ -334,66 +353,71 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
impl::namedBarrier();
}
-[[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) {
+[[clang::noinline]] OMP_ATTRS void __kmpc_barrier_simple_spmd(IdentTy *Loc,
+ int32_t TId) {
synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
}
-[[clang::noinline]] void __kmpc_barrier_simple_generic(IdentTy *Loc,
- int32_t TId) {
+[[clang::noinline]] OMP_ATTRS void __kmpc_barrier_simple_generic(IdentTy *Loc,
+ int32_t TId) {
synchronize::threads(atomic::OrderingTy::seq_cst);
}
-int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
+OMP_ATTRS int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
return omp_get_thread_num() == 0;
}
-void __kmpc_end_master(IdentTy *Loc, int32_t TId) {}
+OMP_ATTRS void __kmpc_end_master(IdentTy *Loc, int32_t TId) {}
-int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter) {
+OMP_ATTRS int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter) {
return omp_get_thread_num() == Filter;
}
-void __kmpc_end_masked(IdentTy *Loc, int32_t TId) {}
+OMP_ATTRS void __kmpc_end_masked(IdentTy *Loc, int32_t TId) {}
-int32_t __kmpc_single(IdentTy *Loc, int32_t TId) {
+OMP_ATTRS int32_t __kmpc_single(IdentTy *Loc, int32_t TId) {
return __kmpc_master(Loc, TId);
}
-void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
+OMP_ATTRS void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
// The barrier is explicitly called.
}
-void __kmpc_flush(IdentTy *Loc) { fence::kernel(atomic::seq_cst); }
+OMP_ATTRS void __kmpc_flush(IdentTy *Loc) { fence::kernel(atomic::seq_cst); }
-uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); }
+OMP_ATTRS uint64_t __kmpc_warp_active_thread_mask(void) {
+ return mapping::activemask();
+}
-void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); }
+OMP_ATTRS void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); }
-void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
+OMP_ATTRS void __kmpc_critical(IdentTy *Loc, int32_t TId,
+ CriticalNameTy *Name) {
impl::setCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
}
-void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
+OMP_ATTRS void __kmpc_end_critical(IdentTy *Loc, int32_t TId,
+ CriticalNameTy *Name) {
impl::unsetCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
}
-void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }
+OMP_ATTRS void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }
-void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }
+OMP_ATTRS void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }
-void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }
+OMP_ATTRS void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }
-void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
+OMP_ATTRS void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
-int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); }
+OMP_ATTRS int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); }
-void ompx_sync_block(int Ordering) {
+OMP_ATTRS void ompx_sync_block(int Ordering) {
impl::syncThreadsAligned(atomic::OrderingTy(Ordering));
}
-void ompx_sync_block_acq_rel() {
+OMP_ATTRS void ompx_sync_block_acq_rel() {
impl::syncThreadsAligned(atomic::OrderingTy::acq_rel);
}
-void ompx_sync_block_divergent(int Ordering) {
+OMP_ATTRS void ompx_sync_block_divergent(int Ordering) {
impl::syncThreads(atomic::OrderingTy(Ordering));
}
} // extern "C"
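For reference, the lock helpers carried through this hunk amount to a test-and-set spin lock built on the runtime's atomic wrappers: setLock spins on a seq_cst compare-and-swap until it flips the word from UNSET to SET, unsetLock releases by exchanging UNSET back in, and testLock reads the word via an add of zero. Below is a minimal host-side sketch of the same pattern using std::atomic instead of the DeviceRTL atomic:: helpers; the SpinLock type and the std::thread demo are illustrative only and not part of the patch.

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

constexpr uint32_t UNSET = 0;
constexpr uint32_t SET = 1;

struct SpinLock {
  std::atomic<uint32_t> Word{UNSET};

  void set() {
    uint32_t Expected = UNSET;
    // Spin until this thread is the one that flips UNSET -> SET.
    while (!Word.compare_exchange_weak(Expected, SET,
                                       std::memory_order_seq_cst))
      Expected = UNSET; // CAS rewrites Expected on failure; reset it.
  }

  void unset() {
    // Release by writing UNSET back, as unsetLock does with atomicExchange.
    Word.exchange(UNSET, std::memory_order_seq_cst);
  }

  int test() const {
    // testLock above reads the word with an add of 0; a plain load suffices here.
    return Word.load(std::memory_order_seq_cst);
  }
};

int main() {
  SpinLock Lock;
  long Counter = 0;
  std::vector<std::thread> Threads;
  for (int I = 0; I < 4; ++I)
    Threads.emplace_back([&] {
      for (int J = 0; J < 100000; ++J) {
        Lock.set();
        ++Counter; // Protected by the lock.
        Lock.unset();
      }
    });
  for (auto &T : Threads)
    T.join();
  std::printf("counter = %ld (expected 400000)\n", Counter);
}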
diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp
index 23a967c1a337e2..6bcf461103dd74 100644
--- a/offload/DeviceRTL/src/Tasking.cpp
+++ b/offload/DeviceRTL/src/Tasking.cpp
@@ -24,10 +24,10 @@ using namespace ompx;
extern "C" {
-TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
- size_t TaskSizeInclPrivateValues,
- size_t SharedValuesSize,
- TaskFnTy TaskFn) {
+OMP_ATTRS TaskDescriptorTy *
+__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
+ size_t TaskSizeInclPrivateValues, size_t SharedValuesSize,
+ TaskFnTy TaskFn) {
auto TaskSizeInclPrivateValuesPadded =
utils::roundUp(TaskSizeInclPrivateValues, uint64_t(sizeof(void *)));
auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize;
@@ -40,14 +40,14 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
return TaskDescriptor;
}
-int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor) {
+OMP_ATTRS int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId,
+ TaskDescriptorTy *TaskDescriptor) {
return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0);
}
-int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor, int32_t,
- void *, int32_t, void *) {
+OMP_ATTRS int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
+ TaskDescriptorTy *TaskDescriptor,
+ int32_t, void *, int32_t, void *) {
state::DateEnvironmentRAII DERAII(Loc);
TaskDescriptor->TaskFn(0, TaskDescriptor);
@@ -56,33 +56,35 @@ int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
return 0;
}
-void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor) {
+OMP_ATTRS void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId,
+ TaskDescriptorTy *TaskDescriptor) {
state::enterDataEnvironment(Loc);
}
-void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor) {
+OMP_ATTRS void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId,
+ TaskDescriptorTy *TaskDescriptor) {
state::exitDataEnvironment();
memory::freeGlobal(TaskDescriptor, "explicit task descriptor");
}
-void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t,
- void *) {}
+OMP_ATTRS void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *,
+ int32_t, void *) {}
-void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {}
+OMP_ATTRS void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {}
-void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {}
+OMP_ATTRS void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {}
-int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; }
+OMP_ATTRS int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) {
+ return 0;
+}
-int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; }
+OMP_ATTRS int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; }
-void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor, int,
- uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int,
- int32_t, uint64_t, void *) {
+OMP_ATTRS void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
+ TaskDescriptorTy *TaskDescriptor, int,
+ uint64_t *LowerBound, uint64_t *UpperBound,
+ int64_t, int, int32_t, uint64_t, void *) {
// Skip task entirely if empty iteration space.
if (*LowerBound > *UpperBound)
return;
@@ -93,7 +95,7 @@ void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
__kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0);
}
-int omp_in_final(void) {
+OMP_ATTRS int omp_in_final(void) {
// treat all tasks as final... Specs may expect runtime to keep
// track more precisely if a task was actively set by users... This
// is not explicitly specified; will treat as if runtime can
@@ -101,7 +103,7 @@ int omp_in_final(void) {
return 1;
}
-int omp_get_max_task_priority(void) { return 0; }
+OMP_ATTRS int omp_get_max_task_priority(void) { return 0; }
}
#pragma omp end declare target
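The __kmpc_omp_task_alloc hunk above pads the descriptor-plus-private-values size up to pointer alignment and appends the shared-value block to the same allocation. The sketch below only reproduces that arithmetic; roundUp is assumed to be the usual round-up-to-multiple helper, and the runtime's actual allocator (memory::allocGlobal) is not modeled.

#include <cstdint>
#include <cstdio>

// Assumed behavior of utils::roundUp: round X up to the next multiple of Align.
static uint64_t roundUp(uint64_t X, uint64_t Align) {
  return (X + Align - 1) / Align * Align;
}

int main() {
  uint64_t TaskSizeInclPrivateValues = 42; // descriptor + private values
  uint64_t SharedValuesSize = 24;          // pointers to shared variables

  // Pad the private part to pointer alignment so the shared-value block that
  // follows it is suitably aligned, then size one allocation for both.
  uint64_t Padded = roundUp(TaskSizeInclPrivateValues, sizeof(void *));
  uint64_t TaskSizeTotal = Padded + SharedValuesSize;

  std::printf("private+descriptor: %llu, padded: %llu, total: %llu\n",
              (unsigned long long)TaskSizeInclPrivateValues,
              (unsigned long long)Padded, (unsigned long long)TaskSizeTotal);
  // The shared-value area starts at offset `Padded` inside the allocation.
}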
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index ad60e66548be90..e7b5986b40d2e9 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -70,8 +70,9 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
*/
// helper function for static chunk
- static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
- T entityId, T numberOfEntities) {
+ OMP_ATTRS static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride,
+ ST chunk, T entityId,
+ T numberOfEntities) {
// each thread executes multiple chunks all of the same size, except
// the last one
// distance between two successive chunks
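The comment above describes the static-with-chunk policy: each entity owns every NumEntities-th chunk of Chunk iterations, and the helper reports only the entity's first chunk plus the stride to its next one. The hunk does not show the full ForStaticChunk body, so the following is a simplified sketch of those semantics only; clamping of the final, short chunk is deliberately left out.

#include <cstdint>
#include <cstdio>

// Simplified sketch: entity `Id` owns chunks Id, Id + N, Id + 2N, ... of
// `Chunk` iterations each. The caller advances by `Stride` to reach its
// next chunk.
static void forStaticChunk(int64_t &Lb, int64_t &Ub, int64_t &Stride,
                           int64_t Chunk, int64_t Id, int64_t NumEntities) {
  Stride = NumEntities * Chunk; // distance between two successive chunks
  Lb = Lb + Id * Chunk;         // first chunk owned by this entity
  Ub = Lb + Chunk - 1;          // inclusive upper bound of that chunk
}

int main() {
  // Iteration space [0, 23], chunk size 4, three threads.
  const int64_t Chunk = 4, NumThreads = 3;
  for (int64_t Tid = 0; Tid < NumThreads; ++Tid) {
    int64_t Lb = 0, Ub = 23, Stride = 0;
    forStaticChunk(Lb, Ub, Stride, Chunk, Tid, NumThreads);
    std::printf("thread %lld: first chunk [%lld, %lld], stride %lld\n",
                (long long)Tid, (long long)Lb, (long long)Ub,
                (long long)Stride);
  }
}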
@@ -90,8 +91,9 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
// Loop with static scheduling without chunk
// helper function for static no chunk
- static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
- T entityId, T numberOfEntities) {
+ OMP_ATTRS static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride,
+ ST &chunk, T entityId,
+ T numberOfEntities) {
// No chunk size specified. Each thread or warp gets at most one
// chunk; chunks are all almost of equal size
T loopSize = ub - lb + 1;
@@ -115,9 +117,10 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
////////////////////////////////////////////////////////////////////////////////
// Support for Static Init
- static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter,
- T *plower, T *pupper, ST *pstride, ST chunk,
- bool IsSPMDExecutionMode) {
+ OMP_ATTRS static void for_static_init(int32_t, int32_t schedtype,
+ int32_t *plastiter, T *plower,
+ T *pupper, ST *pstride, ST chunk,
+ bool IsSPMDExecutionMode) {
int32_t gtid = omp_get_thread_num();
int numberOfActiveOMPThreads = omp_get_num_threads();
@@ -202,14 +205,14 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
////////////////////////////////////////////////////////////////////////////////
// Support for dispatch Init
- static int OrderedSchedule(kmp_sched_t schedule) {
+ OMP_ATTRS static int OrderedSchedule(kmp_sched_t schedule) {
return schedule >= kmp_sched_ordered_first &&
schedule <= kmp_sched_ordered_last;
}
- static void dispatch_init(IdentTy *loc, int32_t threadId,
- kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
- DynamicScheduleTracker *DST) {
+ OMP_ATTRS static void dispatch_init(IdentTy *loc, int32_t threadId,
+ kmp_sched_t schedule, T lb, T ub, ST st,
+ ST chunk, DynamicScheduleTracker *DST) {
int tid = mapping::getThreadIdInBlock();
T tnum = omp_get_num_threads();
T tripCount = ub - lb + 1; // +1 because ub is inclusive
@@ -339,7 +342,7 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
////////////////////////////////////////////////////////////////////////////////
// Support for dispatch next
- static uint64_t NextIter() {
+ OMP_ATTRS static uint64_t NextIter() {
__kmpc_impl_lanemask_t active = mapping::activemask();
uint32_t leader = utils::ffs(active) - 1;
uint32_t change = utils::popc(active);
@@ -353,8 +356,8 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
return warp_res + rank;
}
- static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
- T loopUpperBound) {
+ OMP_ATTRS static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
+ T loopLowerBound, T loopUpperBound) {
T N = NextIter();
lb = loopLowerBound + N * chunkSize;
ub = lb + chunkSize - 1; // Clang uses i <= ub
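The dynamic-scheduling path above claims a global chunk index via NextIter (on the GPU one leader lane per warp performs the atomic add for all active lanes and broadcasts the result) and then maps index N to the bounds lb = loopLowerBound + N * chunkSize, ub = lb + chunkSize - 1. The host-side sketch below keeps that mapping but replaces the warp aggregation with a plain fetch-add and collapses the runtime's NOT_FINISHED / LAST_CHUNK / FINISHED result codes into a bool.

#include <atomic>
#include <cstdint>
#include <cstdio>

static std::atomic<uint64_t> Cnt{0}; // shared iteration-chunk counter

// Host analogue of NextIter(): claim the next global chunk index.
static uint64_t nextIter() { return Cnt.fetch_add(1); }

// Sketch of DynamicNextChunk: turn a claimed index N into the bounds of the
// N-th chunk and report whether the loop is exhausted.
static bool dynamicNextChunk(int64_t &Lb, int64_t &Ub, int64_t ChunkSize,
                             int64_t LoopLowerBound, int64_t LoopUpperBound) {
  int64_t N = (int64_t)nextIter();
  Lb = LoopLowerBound + N * ChunkSize;
  Ub = Lb + ChunkSize - 1; // Clang uses i <= ub
  if (Lb > LoopUpperBound)
    return false;          // FINISHED: nothing left to claim
  if (Ub > LoopUpperBound)
    Ub = LoopUpperBound;   // clamp the last, partial chunk
  return true;
}

int main() {
  int64_t Lb, Ub;
  while (dynamicNextChunk(Lb, Ub, /*ChunkSize=*/4, /*Lower=*/0, /*Upper=*/9))
    std::printf("claimed chunk [%lld, %lld]\n", (long long)Lb, (long long)Ub);
}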
@@ -379,9 +382,9 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
return FINISHED;
}
- static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
- T *plower, T *pupper, ST *pstride,
- DynamicScheduleTracker *DST) {
+ OMP_ATTRS static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
+ T *plower, T *pupper, ST *pstride,
+ DynamicScheduleTracker *DST) {
// ID of a thread in its own warp
// automatically selects thread or warp ID based on selected implementation
@@ -432,7 +435,7 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
return DISPATCH_NOTFINISHED;
}
- static void dispatch_fini() {
+ OMP_ATTRS static void dispatch_fini() {
// nothing
}
@@ -462,7 +465,7 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
static DynamicScheduleTracker **SHARED(ThreadDST);
// Create a new DST, link the current one, and define the new as current.
-static DynamicScheduleTracker *pushDST() {
+OMP_ATTRS static DynamicScheduleTracker *pushDST() {
int32_t ThreadIndex = mapping::getThreadIdInBlock();
// Each block will allocate an array of pointers to DST structs. The array is
// equal in length to the number of threads in that block.
@@ -491,12 +494,12 @@ static DynamicScheduleTracker *pushDST() {
}
// Return the current DST.
-static DynamicScheduleTracker *peekDST() {
+OMP_ATTRS static DynamicScheduleTracker *peekDST() {
return ThreadDST[mapping::getThreadIdInBlock()];
}
// Pop the current DST and restore the last one.
-static void popDST() {
+OMP_ATTRS static void popDST() {
int32_t ThreadIndex = mapping::getThreadIdInBlock();
DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex];
DynamicScheduleTracker *OldDST = CurrentDST->NextDST;
@@ -513,7 +516,7 @@ static void popDST() {
synchronize::threads(atomic::seq_cst);
}
-void workshare::init(bool IsSPMD) {
+OMP_ATTRS void workshare::init(bool IsSPMD) {
if (mapping::isInitialThreadInLevel0(IsSPMD))
ThreadDST = nullptr;
}
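The pushDST/peekDST/popDST trio above maintains a per-thread stack of DynamicScheduleTracker objects so nested dynamic loops each see their own tracker: push links the new tracker to the current one and installs it as current, peek returns the current one, pop restores the previous one. The sketch below is a simplified host analogue; the lazy per-block allocation of the ThreadDST array, the global memory allocator, and the barriers around teardown are all omitted.

#include <cstdio>

// One head pointer per thread; each tracker links to the one it shadows.
struct DynamicScheduleTracker {
  long Counter = 0;
  DynamicScheduleTracker *NextDST = nullptr;
};

constexpr int MaxThreads = 4;
static DynamicScheduleTracker *ThreadDST[MaxThreads] = {};

static DynamicScheduleTracker *pushDST(int ThreadIndex) {
  auto *NewDST = new DynamicScheduleTracker();
  NewDST->NextDST = ThreadDST[ThreadIndex]; // link the current one
  ThreadDST[ThreadIndex] = NewDST;          // and make the new one current
  return NewDST;
}

static DynamicScheduleTracker *peekDST(int ThreadIndex) {
  return ThreadDST[ThreadIndex];
}

static void popDST(int ThreadIndex) {
  DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex];
  ThreadDST[ThreadIndex] = CurrentDST->NextDST; // restore the last one
  delete CurrentDST;
}

int main() {
  pushDST(0)->Counter = 1; // outer dynamic loop
  pushDST(0)->Counter = 2; // nested dynamic loop shadows the outer one
  std::printf("current counter: %ld\n", peekDST(0)->Counter); // prints 2
  popDST(0);
  std::printf("after pop: %ld\n", peekDST(0)->Counter);       // prints 1
  popDST(0);
}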
@@ -521,168 +524,179 @@ void workshare::init(bool IsSPMD) {
extern "C" {
// init
-void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
- int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
+OMP_ATTRS void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid,
+ int32_t schedule, int32_t lb, int32_t ub,
+ int32_t st, int32_t chunk) {
DynamicScheduleTracker *DST = pushDST();
omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}
-void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
- uint32_t lb, uint32_t ub, int32_t st,
- int32_t chunk) {
+OMP_ATTRS void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid,
+ int32_t schedule, uint32_t lb,
+ uint32_t ub, int32_t st, int32_t chunk) {
DynamicScheduleTracker *DST = pushDST();
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}
-void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
- int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
+OMP_ATTRS void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid,
+ int32_t schedule, int64_t lb, int64_t ub,
+ int64_t st, int64_t chunk) {
DynamicScheduleTracker *DST = pushDST();
omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}
-void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
- uint64_t lb, uint64_t ub, int64_t st,
- int64_t chunk) {
+OMP_ATTRS void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid,
+ int32_t schedule, uint64_t lb,
+ uint64_t ub, int64_t st, int64_t chunk) {
DynamicScheduleTracker *DST = pushDST();
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}
// next
-int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
- int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
+OMP_ATTRS int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
+ int32_t *p_lb, int32_t *p_ub,
+ int32_t *p_st) {
DynamicScheduleTracker *DST = peekDST();
return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st, DST);
}
-int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
- uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
+OMP_ATTRS int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid,
+ int32_t *p_last, uint32_t *p_lb,
+ uint32_t *p_ub, int32_t *p_st) {
DynamicScheduleTracker *DST = peekDST();
return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st, DST);
}
-int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
- int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
+OMP_ATTRS int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
+ int64_t *p_lb, int64_t *p_ub,
+ int64_t *p_st) {
DynamicScheduleTracker *DST = peekDST();
return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st, DST);
}
-int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
- uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
+OMP_ATTRS int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid,
+ int32_t *p_last, uint64_t *p_lb,
+ uint64_t *p_ub, int64_t *p_st) {
DynamicScheduleTracker *DST = peekDST();
return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st, DST);
}
// fini
-void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
+OMP_ATTRS void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
}
-void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
+OMP_ATTRS void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
}
-void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
+OMP_ATTRS void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
}
-void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
+OMP_ATTRS void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
}
// deinit
-void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); }
+OMP_ATTRS void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); }
////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////
-void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int32_t *plower, int32_t *pupper,
- int32_t *pstride, int32_t incr, int32_t chunk) {
+OMP_ATTRS void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ int32_t *plower, int32_t *pupper,
+ int32_t *pstride, int32_t incr,
+ int32_t chunk) {
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
-void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- uint32_t *plower, uint32_t *pupper,
- int32_t *pstride, int32_t incr, int32_t chunk) {
+OMP_ATTRS void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ uint32_t *plower, uint32_t *pupper,
+ int32_t *pstride, int32_t incr,
+ int32_t chunk) {
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
-void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int64_t *plower, int64_t *pupper,
- int64_t *pstride, int64_t incr, int64_t chunk) {
+OMP_ATTRS void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ int64_t *plower, int64_t *pupper,
+ int64_t *pstride, int64_t incr,
+ int64_t chunk) {
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
-void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- uint64_t *plower, uint64_t *pupper,
- int64_t *pstride, int64_t incr, int64_t chunk) {
+OMP_ATTRS void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ uint64_t *plower, uint64_t *pupper,
+ int64_t *pstride, int64_t incr,
+ int64_t chunk) {
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
-void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int32_t *plower, int32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk) {
+OMP_ATTRS void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
+ int32_t schedtype,
+ int32_t *plastiter,
+ int32_t *plower, int32_t *pupper,
+ int32_t *pstride, int32_t incr,
+ int32_t chunk) {
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
-void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- uint32_t *plower, uint32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk) {
+OMP_ATTRS void __kmpc_distribute_static_init_4u(
+ IdentTy *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+ uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
+ int32_t chunk) {
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
-void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int64_t *plower, int64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk) {
+OMP_ATTRS void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
+ int32_t schedtype,
+ int32_t *plastiter,
+ int64_t *plower, int64_t *pupper,
+ int64_t *pstride, int64_t incr,
+ int64_t chunk) {
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
-void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- uint64_t *plower, uint64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk) {
+OMP_ATTRS void __kmpc_distribute_static_init_8u(
+ IdentTy *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+ uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
+ int64_t chunk) {
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
-void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
+OMP_ATTRS void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
-void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
+OMP_ATTRS void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {
+}
}
namespace ompx {
@@ -696,10 +710,11 @@ template <typename Ty> class StaticLoopChunker {
/// size equal to the number of threads in the block and a thread chunk size
/// equal to one. In contrast to the chunked version we can get away with a
/// single loop in this case
- static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
- Ty NumBlocks, Ty BId, Ty NumThreads,
- Ty TId, Ty NumIters,
- bool OneIterationPerThread) {
+ OMP_ATTRS static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *),
+ void *Arg, Ty NumBlocks,
+ Ty BId, Ty NumThreads, Ty TId,
+ Ty NumIters,
+ bool OneIterationPerThread) {
Ty KernelIteration = NumBlocks * NumThreads;
// Start index in the normalized space.
@@ -726,11 +741,12 @@ template <typename Ty> class StaticLoopChunker {
/// Generic loop nest that handles block and/or thread distribution in the
/// presence of user specified chunk sizes (for at least one of them).
- static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
- Ty BlockChunk, Ty NumBlocks, Ty BId,
- Ty ThreadChunk, Ty NumThreads, Ty TId,
- Ty NumIters,
- bool OneIterationPerThread) {
+ OMP_ATTRS static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *),
+ void *Arg, Ty BlockChunk,
+ Ty NumBlocks, Ty BId,
+ Ty ThreadChunk, Ty NumThreads,
+ Ty TId, Ty NumIters,
+ bool OneIterationPerThread) {
Ty KernelIteration = NumBlocks * BlockChunk;
// Start index in the chunked space.
@@ -768,8 +784,9 @@ template <typename Ty> class StaticLoopChunker {
public:
/// Worksharing `for`-loop.
- static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
- Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
+ OMP_ATTRS static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *),
+ void *Arg, Ty NumIters, Ty NumThreads,
+ Ty ThreadChunk) {
ASSERT(NumIters >= 0, "Bad iteration count");
ASSERT(ThreadChunk >= 0, "Bad thread count");
@@ -807,8 +824,8 @@ template <typename Ty> class StaticLoopChunker {
}
/// Worksharing `distribute`-loop.
- static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
- Ty NumIters, Ty BlockChunk) {
+ OMP_ATTRS static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *),
+ void *Arg, Ty NumIters, Ty BlockChunk) {
ASSERT(icv::Level == 0, "Bad distribute");
ASSERT(icv::ActiveLevel == 0, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
@@ -854,9 +871,10 @@ template <typename Ty> class StaticLoopChunker {
}
/// Worksharing `distribute parallel for`-loop.
- static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
- void *Arg, Ty NumIters, Ty NumThreads,
- Ty BlockChunk, Ty ThreadChunk) {
+ OMP_ATTRS static void DistributeFor(IdentTy *Loc,
+ void (*LoopBody)(Ty, void *), void *Arg,
+ Ty NumIters, Ty NumThreads, Ty BlockChunk,
+ Ty ThreadChunk) {
ASSERT(icv::Level == 1, "Bad distribute");
ASSERT(icv::ActiveLevel == 1, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
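The StaticLoopChunker comments above describe the no-chunk case as a single loop whose kernel-wide step is NumBlocks * NumThreads, with a block chunk equal to the number of threads per block and a thread chunk of one. The host simulation below illustrates that iteration layout; the start-index formula is the usual grid-stride arrangement and is an assumption here, since the hunk only shows the KernelIteration computation, and the OneIterationPerThread fast path is dropped.

#include <cstdint>
#include <cstdio>
#include <vector>

// Host simulation of the no-chunk normalized loop nest: each (block, thread)
// pair starts at its own offset and strides by NumBlocks * NumThreads.
static void normalizedLoopNestNoChunk(void (*LoopBody)(int64_t, void *),
                                      void *Arg, int64_t NumBlocks, int64_t BId,
                                      int64_t NumThreads, int64_t TId,
                                      int64_t NumIters) {
  int64_t KernelIteration = NumBlocks * NumThreads;
  for (int64_t I = BId * NumThreads + TId; I < NumIters; I += KernelIteration)
    LoopBody(I, Arg);
}

int main() {
  const int64_t NumBlocks = 2, NumThreads = 3, NumIters = 10;
  std::vector<int> Hits(NumIters, 0);
  auto Body = [](int64_t I, void *Arg) {
    ++(*static_cast<std::vector<int> *>(Arg))[I];
  };
  // Emulate every (block, thread) pair running the same loop nest.
  for (int64_t BId = 0; BId < NumBlocks; ++BId)
    for (int64_t TId = 0; TId < NumThreads; ++TId)
      normalizedLoopNestNoChunk(Body, &Hits, NumBlocks, BId, NumThreads, TId,
                                NumIters);
  for (int64_t I = 0; I < NumIters; ++I)
    std::printf("iteration %lld executed %d time(s)\n", (long long)I, Hits[I]);
}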