[Openmp-commits] [clang] [openmp] [OFFLOAD] Build DeviceRTL with SPIRV backend (PR #174675)

Tue Jan 6 16:55:07 PST 2026

llvmbot wrote:




@llvm/pr-subscribers-backend-x86

Author: None (fineg74)

<details>
<summary>Changes</summary>

This PR adds the configuration needed to build DeviceRTL with the SPIR-V backend. It is primarily used by the Level Zero plugin for Intel GPUs.

---

Patch is 22.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/174675.diff


11 Files Affected:

- (modified) clang/lib/Headers/CMakeLists.txt (+1) 
- (modified) clang/lib/Headers/gpuintrin.h (+3-1) 
- (added) clang/lib/Headers/spirvintrin.h (+235) 
- (modified) openmp/device/CMakeLists.txt (+37-23) 
- (modified) openmp/device/include/DeviceTypes.h (+7-1) 
- (modified) openmp/device/include/LibC.h (+9) 
- (modified) openmp/device/include/State.h (+1-1) 
- (modified) openmp/device/src/Allocator.cpp (+1-1) 
- (modified) openmp/device/src/LibC.cpp (+4-1) 
- (modified) openmp/device/src/Parallelism.cpp (+5-5) 
- (modified) openmp/device/src/Synchronization.cpp (+73) 


``````````diff

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 1b96ac417bf70..c92b370b88d2d 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -297,6 +297,7 @@ set(gpu_files
   gpuintrin.h
   nvptxintrin.h
   amdgpuintrin.h
+  spirvintrin.h
   )
 
 set(windows_only_files
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 7afc82413996b..8b75cc14878e3 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -18,7 +18,7 @@
 #define __GPUINTRIN_H
 
 #if !defined(_DEFAULT_FN_ATTRS)
-#if defined(__HIP__) || defined(__CUDA__)
+#if defined(__HIP__) || defined(__CUDA__) || defined(__SPIRV__)
 #define _DEFAULT_FN_ATTRS __attribute__((device))
 #else
 #define _DEFAULT_FN_ATTRS
@@ -62,6 +62,8 @@ _Pragma("omp end declare target");
 #include <amdgpuintrin.h>
 #elif !defined(_OPENMP)
 #error "This header is only meant to be used on GPU architectures."
+#elif defined(__SPIRV__)
+#include <spirvintrin.h>
 #endif
 
 _Pragma("omp begin declare target device_type(nohost)");
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
new file mode 100644
index 0000000000000..e66a2bf0767a6
--- /dev/null
+++ b/clang/lib/Headers/spirvintrin.h
@@ -0,0 +1,235 @@
+#ifndef __SPIRVINTRIN_H
+#define __SPIRVINTRIN_H
+
+#ifndef __SPIRV__
+#error "This file is intended for SPIRV targets or offloading to SPIRV"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead"
+#endif
+
+#include <stdint.h>
+#if !defined(__cplusplus)
+_Pragma("push_macro(\"bool\")");
+#define bool _Bool
+#define true 1
+#define false 0
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(spirv64)})");
+
+// Type aliases to the address spaces used by the SPIR-V backend.
+//
+#define __gpu_private  __attribute__((address_space(0)))
+#define __gpu_constant
+#define __gpu_local
+#define __gpu_global __attribute__((address_space(1)))
+#define __gpu_generic __attribute__((address_space(4)))
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((spirv_kernel, visibility("protected")))
+#define __SPIRV_VAR_QUALIFIERS extern const
+// Workgroup and invocation ID functions
+uint64_t __spirv_BuiltInNumWorkgroups(int i);
+uint64_t __spirv_BuiltInWorkgroupId(int i);
+uint64_t __spirv_BuiltInWorkgroupSize(int i);
+uint64_t __spirv_BuiltInLocalInvocationId(int i);
+
+typedef enum {
+  CrossDevice = 0,
+  Device = 1,
+  Workgroup = 2,
+  Subgroup = 3,
+  Invocation = 4
+} Scope_t;
+
+typedef enum {
+  Relaxed = 0x0,
+  Acquire = 0x2,
+  Release = 0x4,
+  AcquireRelease = 0x8,
+  SequentiallyConsistent = 0x10
+} MemorySemantics_t;
+
+using unsigned ProgramAS = 9;
+
+#ifdef __cplusplus
+template <typename... Args> int __spirv_ocl_printf(Args...);
+#endif
+
+// Subgroup
+__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupLocalInvocationId;
+__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupSize;
+
+// Group non-uniform operations
+uint64_t __spirv_GroupNonUniformBallot(uint32_t execution_scope,
+                                       bool predicate);
+uint32_t __spirv_GroupNonUniformBroadcastFirst(uint32_t execution_scope,
+                                               uint32_t value);
+uint32_t __spirv_GroupNonUniformShuffle(uint32_t execution_scope,
+                                        uint32_t value, uint32_t id);
+
+// Synchronization
+void __spirv_ControlBarrier(uint32_t execution_scope, uint32_t memory_scope,
+                            uint32_t semantics);
+void __spirv_MemoryBarrier(uint32_t memory_scope, uint32_t semantics);
+
+// Atomic
+uint32_t __spirv_AtomicIAdd(uint32_t *, int, int, uint32_t);
+void __spirv_AtomicStore(int32_t *, int, int, int);
+int32_t __spirv_AtomicLoad(int32_t *, int, int);
+int32_t __spirv_AtomicCompareExchange(int32_t *, int, int, int, int, int);
+
+
+// Returns the number of blocks in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+  return __spirv_BuiltInNumWorkgroups(0);
+}
+
+// Returns the number of blocks in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+  return __spirv_BuiltInNumWorkgroups(1);
+}
+
+// Returns the number of blocks in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+  return __spirv_BuiltInNumWorkgroups(2);
+}
+
+// Returns the 'x' dimension of the current block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+  return __spirv_BuiltInWorkgroupId(0);
+}
+
+// Returns the 'y' dimension of the current block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+  return __spirv_BuiltInWorkgroupId(1);
+}
+
+// Returns the 'z' dimension of the current block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+  return __spirv_BuiltInWorkgroupId(2);
+}
+
+// Returns the number of threads in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+  return __spirv_BuiltInWorkgroupSize(0);
+}
+
+// Returns the number of threads in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+  return __spirv_BuiltInWorkgroupSize(1);
+}
+
+// Returns the number of threads in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+  return __spirv_BuiltInWorkgroupSize(2);
+}
+
+// Returns the 'x' dimension id of the thread in the current block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+  return __spirv_BuiltInLocalInvocationId(0);
+}
+
+// Returns the 'y' dimension id of the thread in the current block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+  return __spirv_BuiltInLocalInvocationId(1);
+}
+
+// Returns the 'z' dimension id of the thread in the current block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+  return __spirv_BuiltInLocalInvocationId(2);
+}
+
+// Returns the size of a warp (subgroup) as reported by the SPIR-V built-in.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+  return __spirv_BuiltInSubgroupSize;
+}
+
+// Returns the id of the thread inside of a warp executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+  return __spirv_BuiltInSubgroupLocalInvocationId;
+}
+ 
+// Returns the bit-mask of active threads in the current warp.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) { 
+  uint32_t Size = __gpu_num_lanes();
+  return ((uint64_t)1 << Size) - (uint64_t)1;
+}
+// Copies the value from the first active thread in the warp to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __spirv_GroupNonUniformBroadcastFirst(3, __x);
+}
+// Returns a bitmask of threads in the current warp for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
+  uint64_t ballot = __spirv_GroupNonUniformBallot(3, __x);
+  return __lane_mask & ballot;
+}
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
+   __spirv_ControlBarrier(Scope_t::Workgroup, Scope_t::Workgroup,
+                          0x100 | MemorySemantics_t::SequentiallyConsistent);
+}
+// Waits for all threads in the warp to reconverge for independent scheduling.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+   __spirv_ControlBarrier(Scope_t::Subgroup, Scope_t::Subgroup,
+                          0x80 | MemorySemantics_t::SequentiallyConsistent);
+}
+// Shuffles the lanes inside the warp according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+  return __spirv_GroupNonUniformShuffle(3, __x, __lane);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+// Returns true if the flat pointer points to 'shared' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
+  return false; // TODO
+}
+// Returns true if the flat pointer points to 'local' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
+  return false; // TODO
+}
+// Terminates execution of the calling thread.
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
+}
+// Suspend the thread briefly to assist the scheduler during busy loops.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#if !defined(__cplusplus)
+_Pragma("pop_macro(\"bool\")");
+#endif
+#endif // __SPIRVINTRIN_H
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
index 54cfdfef440a5..0dc43ac034225 100644
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@@ -25,14 +25,18 @@ set(src_files
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Workshare.cpp
 )
 
-list(APPEND compile_options -flto)
-list(APPEND compile_options -fvisibility=hidden)
+if(NOT "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" AND
+   NOT "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
+  list(APPEND compile_options -flto)
+  list(APPEND compile_options -fvisibility=hidden)
+  list(APPEND compile_options -Wno-unknown-cuda-version)
+endif()
 list(APPEND compile_options -nogpulib)
 list(APPEND compile_options -nostdlibinc)
 list(APPEND compile_options -fno-rtti)
 list(APPEND compile_options -fno-exceptions)
 list(APPEND compile_options -fconvergent-functions)
-list(APPEND compile_options -Wno-unknown-cuda-version)
+
 if(LLVM_DEFAULT_TARGET_TRIPLE)
   list(APPEND compile_options --target=${LLVM_DEFAULT_TARGET_TRIPLE})
 endif()
@@ -43,7 +47,7 @@ endif()
 # instructions yet and we end up missing out on way more important constant
 # propagation. That said, we will run the vectorizer again after the runtime
 # has been linked into the user program.
-list(APPEND compile_options "SHELL: -mllvm -vectorize-slp=false")
+list(APPEND compile_options -mllvm -vectorize-slp=false)
 if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn" OR
    "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn")
   set(target_name "amdgpu")
@@ -52,6 +56,10 @@ elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx" OR
        "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^nvptx")
   set(target_name "nvptx")
   list(APPEND compile_options --cuda-feature=+ptx63)
+elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv64" OR
+       "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv64")
+  set(target_name "spirv") 
+  list(APPEND compile_options -emit-llvm -c)
 endif()
 
 # Trick to combine these into a bitcode file via the linker's LTO pass.
@@ -62,26 +70,32 @@ set_target_properties(libompdevice PROPERTIES
   BUILD_RPATH ""
   INSTALL_RPATH ""
   RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc)
-
-# If the user built with the GPU C library enabled we will use that instead.
-if(TARGET libc)
-  target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
-endif()
-target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
-
-target_include_directories(libompdevice PRIVATE
-                           ${CMAKE_CURRENT_SOURCE_DIR}/include
-                           ${CMAKE_CURRENT_SOURCE_DIR}/../../libc
-                           ${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
-target_compile_options(libompdevice PRIVATE ${compile_options})
-target_link_options(libompdevice PRIVATE
+ 
+  # If the user built with the GPU C library enabled we will use that instead.
+  if(TARGET libc)
+    target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
+  endif()
+  target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
+  
+  target_include_directories(libompdevice PRIVATE
+                             ${CMAKE_CURRENT_SOURCE_DIR}/include
+                             ${CMAKE_CURRENT_SOURCE_DIR}/../../libc
+                             ${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
+  target_compile_options(libompdevice PRIVATE ${compile_options})
+  if(NOT "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" AND
+     NOT "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")  
+    target_link_options(libompdevice PRIVATE
                     "-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
-if(LLVM_DEFAULT_TARGET_TRIPLE)
-  target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
-endif()
-install(TARGETS libompdevice
-        PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
-        DESTINATION ${OPENMP_INSTALL_LIBDIR})
+  else()
+    target_link_options(libompdevice PRIVATE
+                    "-nostdlib" "-emit-llvm" "-Wl")
+  endif()
+  if(LLVM_DEFAULT_TARGET_TRIPLE)
+    target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
+  endif()
+  install(TARGETS libompdevice
+          PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
+          DESTINATION ${OPENMP_INSTALL_LIBDIR})
 
 add_library(ompdevice.all_objs OBJECT IMPORTED)
 set_property(TARGET ompdevice.all_objs APPEND PROPERTY IMPORTED_OBJECTS
diff --git a/openmp/device/include/DeviceTypes.h b/openmp/device/include/DeviceTypes.h
index 213ccfe58b4fb..2c68109ca544d 100644
--- a/openmp/device/include/DeviceTypes.h
+++ b/openmp/device/include/DeviceTypes.h
@@ -131,7 +131,13 @@ struct IdentTy {
 
 using __kmpc_impl_lanemask_t = LaneMaskTy;
 
-using ParallelRegionFnTy = void *;
+#ifdef __SPIRV__
+using FnPtrTy = __attribute__((address_space(ProgramAS))) void *;
+#else
+using FnPtrTy = void *;
+#endif
+
+using ParallelRegionFnTy = FnPtrTy;
 
 using CriticalNameTy = int32_t[8];
 
diff --git a/openmp/device/include/LibC.h b/openmp/device/include/LibC.h
index 94b5e65196067..8881cf46176fd 100644
--- a/openmp/device/include/LibC.h
+++ b/openmp/device/include/LibC.h
@@ -16,7 +16,16 @@
 
 namespace ompx {
 
+// The SPIR-V backend does not support variadic functions other than
+// __spirv_ocl_printf, so regular printf calls in the code are forwarded to it.
+#if defined(__SPIRV__)
+template <size_t N, typename... Args>
+int printf(const char (&Format)[N], Args... args) {
+  return __spirv_ocl_printf(Format, args...);
+}
+#else
 int printf(const char *Format, ...);
+#endif
 
 } // namespace ompx
 
diff --git a/openmp/device/include/State.h b/openmp/device/include/State.h
index cd6013780a49c..31dc1540d7dd4 100644
--- a/openmp/device/include/State.h
+++ b/openmp/device/include/State.h
@@ -219,7 +219,7 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
   __builtin_unreachable();
 }
 
-[[gnu::always_inline, gnu::flatten]] inline void *&
+[[gnu::always_inline, gnu::flatten]] inline FnPtrTy &
 lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
   switch (Kind) {
   case state::VK_ParallelRegionFn:
diff --git a/openmp/device/src/Allocator.cpp b/openmp/device/src/Allocator.cpp
index 34c945c979ffb..3782478932046 100644
--- a/openmp/device/src/Allocator.cpp
+++ b/openmp/device/src/Allocator.cpp
@@ -23,7 +23,7 @@ using namespace allocator;
 // Provide a default implementation of malloc / free for AMDGPU platforms built
 // without 'libc' support.
 extern "C" {
-#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+#if (defined(__AMDGPU__) || defined(__SPIRV__)) && !defined(OMPTARGET_HAS_LIBC)
 [[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
 [[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
 #else
diff --git a/openmp/device/src/LibC.cpp b/openmp/device/src/LibC.cpp
index 83f9233d94803..6934387952b7c 100644
--- a/openmp/device/src/LibC.cpp
+++ b/openmp/device/src/LibC.cpp
@@ -31,14 +31,16 @@ extern "C" {
   for (size_t I = 0; I < count; ++I)
     dstc[I] = C;
 }
-
+#if !defined(__SPIRV__)
 [[gnu::weak]] int printf(const char *Format, ...) {
   __builtin_va_list vlist;
   __builtin_va_start(vlist, Format);
   return ::vprintf(Format, vlist);
 }
+#endif
 }
 
+#if !defined(__SPIRV__)
 namespace ompx {
 [[clang::no_builtin("printf")]] int printf(const char *Format, ...) {
   __builtin_va_list vlist;
@@ -46,3 +48,4 @@ namespace ompx {
   return ::vprintf(Format, vlist);
 }
 } // namespace ompx
+#endif
diff --git a/openmp/device/src/Parallelism.cpp b/openmp/device/src/Parallelism.cpp
index bd2c0799ee9f0..9f74990ce43ea 100644
--- a/openmp/device/src/Parallelism.cpp
+++ b/openmp/device/src/Parallelism.cpp
@@ -68,7 +68,7 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
 
 // Invoke an outlined parallel function unwrapping arguments (up to 32).
 [[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
-                                              int32_t bound_tid, void *fn,
+                                              int32_t bound_tid, FnPtrTy fn,
                                               void **args, int64_t nargs) {
   switch (nargs) {
 #include "generated_microtask_cases.gen"
@@ -84,7 +84,7 @@ extern "C" {
 
 [[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
                                                    int32_t num_threads,
-                                                   void *fn, void **args,
+                                                   FnPtrTy fn, void **args,
                                                    const int64_t nargs) {
   uint32_t TId = mapping::getThreadIdInBlock();
   uint32_t NumThreads = determineNumberOfThreads(num_threads);
@@ -142,8 +142,8 @@ extern "C" {
 
 [[clang::always_inline]] void
 __kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
-                   int32_t num_threads, int proc_bind, void *fn,
-                   void *wrapper_fn, void **args, int64_t nargs,
+                   int32_t num_threads, int proc_bind, FnPtrTy fn,
+                   FnPtrTy wrapper_fn, void **args, int64_t nargs,
                    int32_t nt_strict) {
   uint32_t TId = mapping::getThreadIdInBlock();
 
@@ -261,7 +261,7 @@ __kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
                                           1u, true, ident,
                                           /*ForceTeamState=*/true);
     state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
-                                          (void *)nullptr, true, ident,
+                                          (FnPtrTy) nullptr, true, ident,
                                           /*ForceTeamState=*/true);
     state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                      /*ForceTeamState=*/true);
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 501dc4a291ed1..09edb8dc2d9cc 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -258,6 +258,79 @@ void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
 #endif
 ///}
 
+#if defined(__SPIRV__)
+
+MemorySemantics_t convertOrderingType(atomic::OrderingTy Ordering) {
+  switch (Ordering) {
+  default:
+    __builtin_unreachable();
+  case atomic::relaxed:
+    return MemorySemantics_t::Relaxed;
+  case atomic::acquire:
+    return MemorySemantics_t::Acquire;
...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/174675