[Openmp-commits] [clang] [openmp] [OFFLOAD] Build DeviceRTL with SPIRV backend (PR #174675)
via Openmp-commits
openmp-commits at lists.llvm.org
Fri Jan 9 09:58:02 PST 2026
https://github.com/fineg74 updated https://github.com/llvm/llvm-project/pull/174675
>From 996c2e16dd7317cb1be936f9459145f3b0a7070c Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Mon, 24 Nov 2025 16:30:47 -0800
Subject: [PATCH 01/11] Build DeviceRTL with spirv backend
---
clang/lib/Headers/CMakeLists.txt | 1 +
clang/lib/Headers/gpuintrin.h | 6 +-
clang/lib/Headers/spirvintrin.h | 207 ++++++++++++++++++++++++++
openmp/device/CMakeLists.txt | 105 ++++++++++---
openmp/device/include/DeviceTypes.h | 8 +-
openmp/device/include/LibC.h | 7 +
openmp/device/include/State.h | 2 +-
openmp/device/src/Allocator.cpp | 2 +-
openmp/device/src/LibC.cpp | 5 +-
openmp/device/src/Parallelism.cpp | 10 +-
openmp/device/src/Synchronization.cpp | 96 ++++++++++++
11 files changed, 414 insertions(+), 35 deletions(-)
create mode 100644 clang/lib/Headers/spirvintrin.h
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 33fff7645df65..208f8b9be6d60 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -295,6 +295,7 @@ set(gpu_files
gpuintrin.h
nvptxintrin.h
amdgpuintrin.h
+ spirvintrin.h
)
set(windows_only_files
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 7afc82413996b..cc8a72bcfb0a3 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -18,7 +18,7 @@
#define __GPUINTRIN_H
#if !defined(_DEFAULT_FN_ATTRS)
-#if defined(__HIP__) || defined(__CUDA__)
+#if defined(__HIP__) || defined(__CUDA__) || defined(__SPIRV__)
#define _DEFAULT_FN_ATTRS __attribute__((device))
#else
#define _DEFAULT_FN_ATTRS
@@ -56,7 +56,9 @@ __gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);
_Pragma("omp end declare variant");
_Pragma("omp end declare target");
-#if defined(__NVPTX__)
+#if defined(__SPIRV__)
+#include <spirvintrin.h>
+#elif defined(__NVPTX__)
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
#include <amdgpuintrin.h>
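Note: with this dispatch in place, a single <gpuintrin.h> include resolves to
the SPIR-V implementation whenever __SPIRV__ is defined. A minimal consumer
sketch (the helper name is hypothetical, and it assumes a spirv64 compile so
the macro is set):

  #include <gpuintrin.h>

  // Global thread index in the 'x' dimension, written only against the
  // portable __gpu_* wrappers so the same source also builds for NVPTX
  // and AMDGPU.
  static inline uint32_t flat_thread_id_x(void) {
    return __gpu_block_id_x() * __gpu_num_threads_x() + __gpu_thread_id_x();
  }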
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
new file mode 100644
index 0000000000000..84166a455d4db
--- /dev/null
+++ b/clang/lib/Headers/spirvintrin.h
@@ -0,0 +1,207 @@
+#ifndef __SPIRVINTRIN_H
+#define __SPIRVINTRIN_H
+
+#ifndef __SPIRV__
+#error "This file is intended for SPIRV targets or offloading to SPIRV"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead"
+#endif
+
+#include <stdint.h>
+#if !defined(__cplusplus)
+_Pragma("push_macro(\"bool\")");
+#define bool _Bool
+#define true 1
+#define false 0
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(spirv64)})");
+
+// Type aliases to the address spaces used by the SPIR-V backend.
+//
+// TODO: FIX
+#define __gpu_private
+#define __gpu_constant
+#define __gpu_local
+#define __gpu_global __attribute__((address_space(1)))
+#define __gpu_generic __attribute__((address_space(4)))
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((spirv_kernel, visibility("protected")))
+#define __SPIRV_VAR_QUALIFIERS extern const
+// Workgroup and invocation ID functions
+uint64_t __spirv_BuiltInNumWorkgroups(int i);
+uint64_t __spirv_BuiltInWorkgroupId(int i);
+uint64_t __spirv_BuiltInWorkgroupSize(int i);
+uint64_t __spirv_BuiltInLocalInvocationId(int i);
+
+#ifdef __cplusplus
+template <typename... Args>
+int __spirv_ocl_printf(Args...);
+#endif
+
+// Subgroup functions
+__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupLocalInvocationId;
+__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupSize;
+
+// Group non-uniform operations
+uint64_t __spirv_GroupNonUniformBallot(uint32_t execution_scope, bool predicate);
+uint32_t __spirv_GroupNonUniformBroadcastFirst(uint32_t execution_scope, uint32_t value);
+uint32_t __spirv_GroupNonUniformShuffle(uint32_t execution_scope, uint32_t value, uint32_t id);
+
+// Synchronization
+void __spirv_ControlBarrier(uint32_t execution_scope, uint32_t memory_scope, uint32_t semantics);
+void __spirv_MemoryBarrier(uint32_t memory_scope, uint32_t semantics);
+
+
+// Returns the number of blocks in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+ return __spirv_BuiltInNumWorkgroups(0);
+}
+
+// Returns the number of blocks in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+ return __spirv_BuiltInNumWorkgroups(1);
+}
+
+// Returns the number of blocks in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+ return __spirv_BuiltInNumWorkgroups(2);
+}
+
+// Returns the 'x' dimension of the current block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+ return __spirv_BuiltInWorkgroupId(0);
+}
+
+// Returns the 'y' dimension of the current block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+ return __spirv_BuiltInWorkgroupId(1);
+}
+
+// Returns the 'z' dimension of the current block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+ return __spirv_BuiltInWorkgroupId(2);
+}
+
+// Returns the number of threads in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+ return __spirv_BuiltInWorkgroupSize(0);
+}
+
+// Returns the number of threads in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+ return __spirv_BuiltInWorkgroupSize(1);
+}
+
+// Returns the number of threads in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+ return __spirv_BuiltInWorkgroupSize(2);
+}
+
+// Returns the 'x' dimension id of the thread in the current block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+ return __spirv_BuiltInLocalInvocationId(0);
+}
+
+// Returns the 'y' dimension id of the thread in the current block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+ return __spirv_BuiltInLocalInvocationId(1);
+}
+
+// Returns the 'z' dimension id of the thread in the current block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+ return __spirv_BuiltInLocalInvocationId(2);
+}
+
+// Returns the size of a warp, i.e. the SPIR-V subgroup size.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) { return __spirv_BuiltInSubgroupSize; }
+
+// Returns the id of the thread inside of a warp executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) { return __spirv_BuiltInSubgroupLocalInvocationId; }
+
+// Returns the bit-mask of active threads in the current warp.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+ return __spirv_GroupNonUniformBallot(3, 1);
+}
+// Copies the value from the first active thread in the warp to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __spirv_GroupNonUniformBroadcastFirst(3, __x);
+}
+// Returns a bitmask of threads in the current warp for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+ bool __x) {
+ uint64_t ballot = __spirv_GroupNonUniformBallot(3, __x);
+ return __lane_mask & ballot;
+}
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
+ __spirv_ControlBarrier(4, 2, 0x8); // Workgroup scope, acquire/release semantics
+}
+// Waits for all threads in the warp to reconverge for independent scheduling.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+ __spirv_ControlBarrier(4, 3, 0x8); // Subgroup scope, acquire/release semantics
+}
+// Shuffles the lanes inside the warp according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+ uint32_t __width) {
+ uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+ return __spirv_GroupNonUniformShuffle(3, __x, __lane);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+ return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+ return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+// Returns true if the flat pointer points to 'shared' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
+ return false; // TODO
+ //return to_local(ptr) != 0;
+}
+// Returns true if the flat pointer points to 'local' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
+ return false;
+ //return to_private(ptr) != 0; // TODO
+}
+// Terminates execution of the calling thread.
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
+ __builtin_unreachable();
+}
+// Suspend the thread briefly to assist the scheduler during busy loops.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
+ // SPIR-V doesn't have a direct equivalent; use a memory barrier as a hint
+ __spirv_MemoryBarrier(1, 0x100);
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#if !defined(__cplusplus)
+_Pragma("pop_macro(\"bool\")");
+#endif
+#endif // __SPIRVINTRIN_H
\ No newline at end of file
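To make the wrappers above concrete, a hedged sketch of a kernel built on
them (vec_add and its parameters are hypothetical, not part of this patch):

  #include <gpuintrin.h>

  // Each work-item handles one element; the work-group then synchronizes.
  __gpu_kernel void vec_add(__gpu_global float *out,
                            __gpu_global const float *a,
                            __gpu_global const float *b, uint32_t n) {
    uint32_t i =
        __gpu_block_id_x() * __gpu_num_threads_x() + __gpu_thread_id_x();
    if (i < n)
      out[i] = a[i] + b[i];
    __gpu_sync_threads(); // workgroup-scope control barrier from this header
  }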
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
index 54cfdfef440a5..86f46de912584 100644
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@@ -25,14 +25,17 @@ set(src_files
${CMAKE_CURRENT_SOURCE_DIR}/src/Workshare.cpp
)
-list(APPEND compile_options -flto)
-list(APPEND compile_options -fvisibility=hidden)
+if(NOT "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" AND
+ NOT "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
+ list(APPEND compile_options -flto)
+ list(APPEND compile_options -fvisibility=hidden)
+ list(APPEND compile_options -Wno-unknown-cuda-version)
+endif()
list(APPEND compile_options -nogpulib)
list(APPEND compile_options -nostdlibinc)
list(APPEND compile_options -fno-rtti)
list(APPEND compile_options -fno-exceptions)
list(APPEND compile_options -fconvergent-functions)
-list(APPEND compile_options -Wno-unknown-cuda-version)
if(LLVM_DEFAULT_TARGET_TRIPLE)
list(APPEND compile_options --target=${LLVM_DEFAULT_TARGET_TRIPLE})
endif()
@@ -52,37 +55,91 @@ elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx" OR
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^nvptx")
set(target_name "nvptx")
list(APPEND compile_options --cuda-feature=+ptx63)
+elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv64" OR
+ "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv64")
+ set(target_name "spirv")
+ list(APPEND compile_options -emit-llvm -c)
endif()
-# Trick to combine these into a bitcode file via the linker's LTO pass.
-add_executable(libompdevice ${src_files})
-set_target_properties(libompdevice PROPERTIES
- RUNTIME_OUTPUT_DIRECTORY "${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}"
- LINKER_LANGUAGE CXX
- BUILD_RPATH ""
- INSTALL_RPATH ""
- RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc)
+# Check if we're building for SPIRV
+if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" OR
+ "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
+ # For SPIRV targets, compile each source file to bitcode individually
+ set(bc_files "")
+ foreach(src_file ${src_files})
+ get_filename_component(basename ${src_file} NAME_WE)
+ set(bc_file "${CMAKE_CURRENT_BINARY_DIR}/${basename}.bc")
-# If the user built with the GPU C library enabled we will use that instead.
-if(TARGET libc)
- target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
-endif()
-target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
+ add_custom_command(
+ OUTPUT ${bc_file}
+ COMMAND ${CMAKE_CXX_COMPILER}
+ ARGS ${compile_options}
+ -I${CMAKE_CURRENT_SOURCE_DIR}/include
+ -I${CMAKE_CURRENT_SOURCE_DIR}/../../libc
+ -I${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include
+ $<$<BOOL:${LIBOMPTARGET_GPU_LIBC_SUPPORT}>:-DOMPTARGET_HAS_LIBC>
+ -DSHARED_SCRATCHPAD_SIZE=512
+ -o ${bc_file}
+ ${src_file}
+ DEPENDS ${src_file}
+ COMMENT "Compiling ${src_file} to bitcode"
+ )
+ list(APPEND bc_files ${bc_file})
+ endforeach()
+
+ # Find llvm-link
+ find_program(LLVM_LINK llvm-link HINTS ${LLVM_TOOLS_BINARY_DIR})
+ if(NOT LLVM_LINK)
+ message(FATAL_ERROR "llvm-link not found")
+ endif()
+
+ # Use llvm-link to combine all bitcode files
+ set(output_bc "${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}/libomptarget-${target_name}.bc")
+ add_custom_command(
+ OUTPUT ${output_bc}
+ COMMAND ${CMAKE_COMMAND} -E make_directory "${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}"
+ COMMAND ${LLVM_LINK} ${bc_files} -o ${output_bc}
+ DEPENDS ${bc_files}
+ COMMENT "Linking bitcode files with llvm-link"
+ )
+
+ # Create a target for the linked bitcode
+ add_custom_target(libompdevice ALL DEPENDS ${output_bc})
-target_include_directories(libompdevice PRIVATE
+ # Install the bitcode file
+ install(FILES ${output_bc}
+ PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
+ DESTINATION ${OPENMP_INSTALL_LIBDIR})
+else()
+ # Trick to combine these into a bitcode file via the linker's LTO pass.
+ add_executable(libompdevice ${src_files})
+ set_target_properties(libompdevice PROPERTIES
+ RUNTIME_OUTPUT_DIRECTORY "${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}"
+ LINKER_LANGUAGE CXX
+ BUILD_RPATH ""
+ INSTALL_RPATH ""
+ RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc)
+
+ # If the user built with the GPU C library enabled we will use that instead.
+ if(TARGET libc)
+ target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
+ endif()
+ target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
+
+ target_include_directories(libompdevice PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/../../libc
${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
-target_compile_options(libompdevice PRIVATE ${compile_options})
-target_link_options(libompdevice PRIVATE
+ target_compile_options(libompdevice PRIVATE ${compile_options})
+ target_link_options(libompdevice PRIVATE
"-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
-if(LLVM_DEFAULT_TARGET_TRIPLE)
- target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
-endif()
-install(TARGETS libompdevice
+ if(LLVM_DEFAULT_TARGET_TRIPLE)
+ target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
+ endif()
+ install(TARGETS libompdevice
PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
DESTINATION ${OPENMP_INSTALL_LIBDIR})
-
+endif()
add_library(ompdevice.all_objs OBJECT IMPORTED)
set_property(TARGET ompdevice.all_objs APPEND PROPERTY IMPORTED_OBJECTS
${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}/libomptarget-${target_name}.bc)
diff --git a/openmp/device/include/DeviceTypes.h b/openmp/device/include/DeviceTypes.h
index 2e5d92380f040..3a3cb46c3b7d9 100644
--- a/openmp/device/include/DeviceTypes.h
+++ b/openmp/device/include/DeviceTypes.h
@@ -128,7 +128,13 @@ struct IdentTy {
using __kmpc_impl_lanemask_t = LaneMaskTy;
-using ParallelRegionFnTy = void *;
+#ifdef __SPIRV__
+using FnPtrTy = __attribute__((address_space(9))) void *;
+#else
+using FnPtrTy = void *;
+#endif
+
+using ParallelRegionFnTy = FnPtrTy;
using CriticalNameTy = int32_t[8];
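The point of the alias: on SPIR-V a function pointer is not a flat void *, it
carries address space 9 (the program address space this series later names
ProgramAS), so every slot that stores an outlined parallel region has to use
the qualified type. A hedged illustration (the struct is hypothetical):

  #ifdef __SPIRV__
  using FnPtrTy = __attribute__((address_space(9))) void *;
  #else
  using FnPtrTy = void *;
  #endif

  struct ParallelRegionSlot {
    FnPtrTy WrapperFn; // was plain 'void *' before this change
  };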
diff --git a/openmp/device/include/LibC.h b/openmp/device/include/LibC.h
index 94b5e65196067..a67323b58f381 100644
--- a/openmp/device/include/LibC.h
+++ b/openmp/device/include/LibC.h
@@ -16,7 +16,14 @@
namespace ompx {
+#if defined(__SPIRV__)
+template <size_t N, typename... Args>
+int printf(const char (&Format)[N], Args... args) {
+ return __spirv_ocl_printf(Format, args...);
+}
+#else
int printf(const char *Format, ...);
+#endif
} // namespace ompx
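Usage note for the shim above (hedged, SPIR-V path only): because the
template binds the format string as an array reference, only string literals
compile, which is exactly the shape __spirv_ocl_printf accepts.

  ompx::printf("tid=%d of %d\n", 3, 8); // OK: literal format, fixed args

  // Does not compile on SPIR-V: a 'const char *' cannot bind to
  // 'const char (&)[N]'.
  //   const char *fmt = "tid=%d\n";
  //   ompx::printf(fmt, 3);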
diff --git a/openmp/device/include/State.h b/openmp/device/include/State.h
index cd6013780a49c..338f5a7f8d591 100644
--- a/openmp/device/include/State.h
+++ b/openmp/device/include/State.h
@@ -219,7 +219,7 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
__builtin_unreachable();
}
-[[gnu::always_inline, gnu::flatten]] inline void *&
+[[gnu::always_inline, gnu::flatten]] inline FnPtrTy&
lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
switch (Kind) {
case state::VK_ParallelRegionFn:
diff --git a/openmp/device/src/Allocator.cpp b/openmp/device/src/Allocator.cpp
index 34c945c979ffb..3782478932046 100644
--- a/openmp/device/src/Allocator.cpp
+++ b/openmp/device/src/Allocator.cpp
@@ -23,7 +23,7 @@ using namespace allocator;
// Provide a default implementation of malloc / free for AMDGPU platforms built
// without 'libc' support.
extern "C" {
-#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+#if (defined(__AMDGPU__) || defined(__SPIRV__)) && !defined(OMPTARGET_HAS_LIBC)
[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
#else
diff --git a/openmp/device/src/LibC.cpp b/openmp/device/src/LibC.cpp
index 83f9233d94803..095d9944531fe 100644
--- a/openmp/device/src/LibC.cpp
+++ b/openmp/device/src/LibC.cpp
@@ -31,14 +31,16 @@ extern "C" {
for (size_t I = 0; I < count; ++I)
dstc[I] = C;
}
-
+#if !defined(__SPIRV__)
[[gnu::weak]] int printf(const char *Format, ...) {
__builtin_va_list vlist;
__builtin_va_start(vlist, Format);
return ::vprintf(Format, vlist);
}
+#endif
}
+#if !defined(__SPIRV__)
namespace ompx {
[[clang::no_builtin("printf")]] int printf(const char *Format, ...) {
__builtin_va_list vlist;
@@ -46,3 +48,4 @@ namespace ompx {
return ::vprintf(Format, vlist);
}
} // namespace ompx
+#endif
\ No newline at end of file
diff --git a/openmp/device/src/Parallelism.cpp b/openmp/device/src/Parallelism.cpp
index 08ce616aee1c4..1d18bddb89eea 100644
--- a/openmp/device/src/Parallelism.cpp
+++ b/openmp/device/src/Parallelism.cpp
@@ -68,7 +68,7 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
// Invoke an outlined parallel function unwrapping arguments (up to 32).
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
- int32_t bound_tid, void *fn,
+ int32_t bound_tid, FnPtrTy fn,
void **args, int64_t nargs) {
switch (nargs) {
#include "generated_microtask_cases.gen"
@@ -84,7 +84,7 @@ extern "C" {
[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
int32_t num_threads,
- void *fn, void **args,
+ FnPtrTy fn, void **args,
const int64_t nargs) {
uint32_t TId = mapping::getThreadIdInBlock();
uint32_t NumThreads = determineNumberOfThreads(num_threads);
@@ -142,8 +142,8 @@ extern "C" {
[[clang::always_inline]] void
__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
- int32_t num_threads, int proc_bind, void *fn,
- void *wrapper_fn, void **args, int64_t nargs) {
+ int32_t num_threads, int proc_bind, FnPtrTy fn,
+ FnPtrTy wrapper_fn, void **args, int64_t nargs) {
uint32_t TId = mapping::getThreadIdInBlock();
// Assert the parallelism level is zero if disabled by the user.
@@ -260,7 +260,7 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
1u, true, ident,
/*ForceTeamState=*/true);
state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
- (void *)nullptr, true, ident,
+ (FnPtrTy)nullptr, true, ident,
/*ForceTeamState=*/true);
state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
/*ForceTeamState=*/true);
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 501dc4a291ed1..385b47e9bf5dd 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -258,6 +258,102 @@ void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
#endif
///}
+#if defined(__SPIRV__)
+typedef enum {
+ CrossDevice = 0,
+ Device = 1,
+ Workgroup = 2,
+ Subgroup = 3,
+ Invocation = 4
+} Scope_t;
+typedef enum {
+ Relaxed = 0x0,
+ Acquire = 0x2,
+ Release = 0x4,
+ AcquireRelease = 0x8,
+ SequentiallyConsistent = 0x10
+} MemorySemantics_t;
+
+extern "C" uint32_t __spirv_AtomicIAdd(uint32_t *, int, int, uint32_t);
+extern "C" void __spirv_MemoryBarrier(int, int);
+extern "C" void __spirv_ControlBarrier(uint32_t, uint32_t, uint32_t);
+
+MemorySemantics_t convertOrderingType(atomic::OrderingTy Ordering) {
+ switch (Ordering) {
+ default:
+ __builtin_unreachable();
+ case atomic::relaxed:
+ return MemorySemantics_t::Relaxed;
+ case atomic::acquire:
+ return MemorySemantics_t::Acquire;
+ case atomic::release:
+ return MemorySemantics_t::Release;
+ case atomic::acq_rel:
+ return MemorySemantics_t::AcquireRelease;
+ case atomic::seq_cst:
+ return MemorySemantics_t::SequentiallyConsistent;
+ }
+}
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
+ atomic::MemScopeTy MemScope) {
+ return __spirv_AtomicIAdd(Address, (int)MemScope,
+ convertOrderingType(Ordering), Val);
+}
+void namedBarrierInit() { __builtin_trap(); } // TODO
+void namedBarrier() { __builtin_trap(); } // TODO
+void fenceTeam(atomic::OrderingTy Ordering) {
+ return __spirv_MemoryBarrier(Scope_t::Workgroup,
+ convertOrderingType(Ordering));
+}
+void fenceKernel(atomic::OrderingTy Ordering) {
+ return __spirv_MemoryBarrier(Scope_t::Invocation,
+ convertOrderingType(Ordering));
+}
+void fenceSystem(atomic::OrderingTy Ordering) {
+ return __spirv_MemoryBarrier(Scope_t::Device, convertOrderingType(Ordering));
+}
+
+void syncWarp(__kmpc_impl_lanemask_t) {
+ __spirv_ControlBarrier(Scope_t::Invocation, Scope_t::Subgroup,
+ MemorySemantics_t::Acquire);
+}
+void syncThreads(atomic::OrderingTy Ordering) {
+ __spirv_ControlBarrier(Scope_t::Invocation, Scope_t::Workgroup,
+ MemorySemantics_t::Acquire);
+}
+void unsetLock(omp_lock_t *Lock) {
+ atomic::store((int32_t *)Lock, 0, atomic::release);
+}
+int testLock(omp_lock_t *Lock) {
+ return atomic::add((int32_t *)Lock, 0, atomic::relaxed);
+}
+void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
+void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
+void setLock(omp_lock_t *Lock) {
+ int32_t *lock_ptr = (int32_t *)Lock;
+ bool acquired = false;
+ int32_t expected;
+ while (!acquired) {
+ expected = 0;
+ if (expected == atomic::load(lock_ptr, atomic::relaxed))
+ acquired =
+ atomic::cas(lock_ptr, expected, 1, atomic::acq_rel, atomic::release);
+ }
+}
+extern "C" int __attribute__((overloadable)) sub_group_scan_inclusive_min(int);
+void unsetCriticalLock(omp_lock_t *Lock) {
+ int id = mapping::getThreadIdInWarp();
+ if (id == sub_group_scan_inclusive_min(id))
+ unsetLock(Lock);
+}
+void setCriticalLock(omp_lock_t *Lock) {
+ int id = mapping::getThreadIdInWarp();
+ if (id == sub_group_scan_inclusive_min(id))
+ setLock(Lock);
+}
+void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
+#endif
+
} // namespace impl
void synchronize::init(bool IsSPMD) {
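For readers decoding the magic numbers in these calls: per the SPIR-V
specification (not introduced by this patch), the scope operand uses
0=CrossDevice, 1=Device, 2=Workgroup, 3=Subgroup, 4=Invocation, and the
semantics operand is a bitmask combining an ordering (0x2 Acquire, 0x4
Release, 0x8 AcquireRelease, 0x10 SequentiallyConsistent) with storage-class
bits (0x80 SubgroupMemory, 0x100 WorkgroupMemory, 0x200
CrossWorkgroupMemory). A hedged sketch of a workgroup barrier spelled out
with named constants:

  constexpr uint32_t WorkgroupScope = 2;
  constexpr uint32_t WorkgroupMemory = 0x100;
  constexpr uint32_t SeqCst = 0x10;

  // All work-items in the group rendezvous, with a seq_cst fence over
  // workgroup-visible memory.
  static void workgroup_barrier() {
    __spirv_ControlBarrier(WorkgroupScope, WorkgroupScope,
                           WorkgroupMemory | SeqCst);
  }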
>From 84def59b3fd01f6c04955309b03c2fee80483d02 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Tue, 6 Jan 2026 16:19:46 -0800
Subject: [PATCH 02/11] Address PR comments
---
clang/lib/Headers/gpuintrin.h | 6 +-
clang/lib/Headers/spirvintrin.h | 53 +++++++++++----
openmp/device/CMakeLists.txt | 93 +++++++--------------------
openmp/device/include/DeviceTypes.h | 2 +-
openmp/device/include/LibC.h | 2 +
openmp/device/src/LibC.cpp | 2 +-
openmp/device/src/Synchronization.cpp | 67 +++++++------------
7 files changed, 94 insertions(+), 131 deletions(-)
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index cc8a72bcfb0a3..8b75cc14878e3 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -56,14 +56,14 @@ __gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);
_Pragma("omp end declare variant");
_Pragma("omp end declare target");
-#if defined(__SPIRV__)
-#include <spirvintrin.h>
-#elif defined(__NVPTX__)
+#if defined(__NVPTX__)
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
#include <amdgpuintrin.h>
#elif !defined(_OPENMP)
#error "This header is only meant to be used on GPU architectures."
+#elif defined(__SPIRV__)
+#include <spirvintrin.h>
#endif
_Pragma("omp begin declare target device_type(nohost)");
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
index 84166a455d4db..6784e7ecc2794 100644
--- a/clang/lib/Headers/spirvintrin.h
+++ b/clang/lib/Headers/spirvintrin.h
@@ -22,8 +22,7 @@ _Pragma("omp begin declare variant match(device = {arch(spirv64)})");
// Type aliases to the address spaces used by the SPIR-V backend.
//
-// TODO: FIX
-#define __gpu_private
+#define __gpu_private __attribute__((address_space(0)))
#define __gpu_constant
#define __gpu_local
#define __gpu_global __attribute__((address_space(1)))
@@ -37,12 +36,30 @@ uint64_t __spirv_BuiltInWorkgroupId(int i);
uint64_t __spirv_BuiltInWorkgroupSize(int i);
uint64_t __spirv_BuiltInLocalInvocationId(int i);
+typedef enum {
+ CrossDevice = 0,
+ Device = 1,
+ Workgroup = 2,
+ Subgroup = 3,
+ Invocation = 4
+} Scope_t;
+
+typedef enum {
+ Relaxed = 0x0,
+ Acquire = 0x2,
+ Release = 0x4,
+ AcquireRelease = 0x8,
+ SequentiallyConsistent = 0x10
+} MemorySemantics_t;
+
+using unsigned ProgramAS = 9;
+
#ifdef __cplusplus
template <typename... Args>
int __spirv_ocl_printf(Args...);
#endif
-// Subgroup functions
+// Subgroup
__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupLocalInvocationId;
__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupSize;
@@ -55,6 +72,12 @@ uint32_t __spirv_GroupNonUniformShuffle(uint32_t execution_scope, uint32_t value
void __spirv_ControlBarrier(uint32_t execution_scope, uint32_t memory_scope, uint32_t semantics);
void __spirv_MemoryBarrier(uint32_t memory_scope, uint32_t semantics);
+// Atomic
+uint32_t __spirv_AtomicIAdd(uint32_t *, int, int, uint32_t);
+void __spirv_AtomicStore(int32_t *, int, int, int);
+int32_t __spirv_AtomicLoad(int32_t *, int, int);
+int32_t __spirv_AtomicCompareExchange(int32_t *, int, int, int, int, int);
+
// Returns the number of blocks in the 'x' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
@@ -117,14 +140,19 @@ _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
}
// Returns the size of a warp, i.e. the SPIR-V subgroup size.
-_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) { return __spirv_BuiltInSubgroupSize; }
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+ return __spirv_BuiltInSubgroupSize;
+}
// Returns the id of the thread inside of a warp executing together.
-_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) { return __spirv_BuiltInSubgroupLocalInvocationId; }
-
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+ return __spirv_BuiltInSubgroupLocalInvocationId;
+}
+
// Returns the bit-mask of active threads in the current warp.
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
- return __spirv_GroupNonUniformBallot(3, 1);
+ uint32_t Size = __gpu_num_lanes();
+ return ((uint64_t)1 << Size) - (uint64_t)1;
}
// Copies the value from the first active thread in the warp to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
@@ -139,11 +167,13 @@ _DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
}
// Waits for all the threads in the block to converge and issues a fence.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
- __spirv_ControlBarrier(4, 2, 0x8); // Workgroup scope, acquire/release semantics
+ __spirv_ControlBarrier(Scope_t::Workgroup, Scope_t::Workgroup,
+ 0x100 | MemorySemantics_t::SequentiallyConsistent);
}
// Waits for all threads in the warp to reconverge for independent scheduling.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
- __spirv_ControlBarrier(4, 3, 0x8); // Subgroup scope, acquire/release semantics
+ __spirv_ControlBarrier(Scope_t::Subgroup, Scope_t::Subgroup,
+ 0x80 | MemorySemantics_t::SequentiallyConsistent);
}
// Shuffles the lanes inside the warp according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
@@ -190,12 +220,9 @@ _DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
}
// Terminates execution of the calling thread.
_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
- __builtin_unreachable();
}
// Suspend the thread briefly to assist the scheduler during busy loops.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
- // SPIR-V doesn't have a direct equivalent; use a memory barrier as a hint
- __spirv_MemoryBarrier(1, 0x100);
}
_Pragma("omp end declare variant");
@@ -204,4 +231,4 @@ _Pragma("omp end declare target");
#if !defined(__cplusplus)
_Pragma("pop_macro(\"bool\")");
#endif
-#endif // __SPIRVINTRIN_H
\ No newline at end of file
+#endif // __SPIRVINTRIN_H
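A small hedged example of the subgroup helpers above in use (the function is
hypothetical; it relies only on wrappers defined in this header):

  // Count how many lanes in the current subgroup hold a nonzero flag.
  static inline uint32_t count_active_flags(uint32_t flag) {
    uint64_t mask = __gpu_lane_mask();
    uint64_t votes = __gpu_ballot(mask, flag != 0);
    return (uint32_t)__builtin_popcountll(votes);
  }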
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
index 86f46de912584..0dc43ac034225 100644
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@@ -36,6 +36,7 @@ list(APPEND compile_options -nostdlibinc)
list(APPEND compile_options -fno-rtti)
list(APPEND compile_options -fno-exceptions)
list(APPEND compile_options -fconvergent-functions)
+
if(LLVM_DEFAULT_TARGET_TRIPLE)
list(APPEND compile_options --target=${LLVM_DEFAULT_TARGET_TRIPLE})
endif()
@@ -46,7 +47,7 @@ endif()
# instructions yet and we end up missing out on way more important constant
# propagation. That said, we will run the vectorizer again after the runtime
# has been linked into the user program.
-list(APPEND compile_options "SHELL: -mllvm -vectorize-slp=false")
+list(APPEND compile_options -mllvm -vectorize-slp=false)
if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn" OR
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn")
set(target_name "amdgpu")
@@ -61,85 +62,41 @@ elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv64" OR
list(APPEND compile_options -emit-llvm -c)
endif()
-# Check if we're building for SPIRV
-if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" OR
- "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
- # For SPIRV targets, compile each source file to bitcode individually
- set(bc_files "")
- foreach(src_file ${src_files})
- get_filename_component(basename ${src_file} NAME_WE)
- set(bc_file "${CMAKE_CURRENT_BINARY_DIR}/${basename}.bc")
-
- add_custom_command(
- OUTPUT ${bc_file}
- COMMAND ${CMAKE_CXX_COMPILER}
- ARGS ${compile_options}
- -I${CMAKE_CURRENT_SOURCE_DIR}/include
- -I${CMAKE_CURRENT_SOURCE_DIR}/../../libc
- -I${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include
- $<$<BOOL:${LIBOMPTARGET_GPU_LIBC_SUPPORT}>:-DOMPTARGET_HAS_LIBC>
- -DSHARED_SCRATCHPAD_SIZE=512
- -o ${bc_file}
- ${src_file}
- DEPENDS ${src_file}
- COMMENT "Compiling ${src_file} to bitcode"
- )
- list(APPEND bc_files ${bc_file})
- endforeach()
-
- # Find llvm-link
- find_program(LLVM_LINK llvm-link HINTS ${LLVM_TOOLS_BINARY_DIR})
- if(NOT LLVM_LINK)
- message(FATAL_ERROR "llvm-link not found")
- endif()
-
- # Use llvm-link to combine all bitcode files
- set(output_bc "${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}/libomptarget-${target_name}.bc")
- add_custom_command(
- OUTPUT ${output_bc}
- COMMAND ${CMAKE_COMMAND} -E make_directory "${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}"
- COMMAND ${LLVM_LINK} ${bc_files} -o ${output_bc}
- DEPENDS ${bc_files}
- COMMENT "Linking bitcode files with llvm-link"
- )
-
- # Create a target for the linked bitcode
- add_custom_target(libompdevice ALL DEPENDS ${output_bc})
-
- # Install the bitcode file
- install(FILES ${output_bc}
- PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
- DESTINATION ${OPENMP_INSTALL_LIBDIR})
-else()
- # Trick to combine these into a bitcode file via the linker's LTO pass.
- add_executable(libompdevice ${src_files})
- set_target_properties(libompdevice PROPERTIES
- RUNTIME_OUTPUT_DIRECTORY "${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}"
- LINKER_LANGUAGE CXX
- BUILD_RPATH ""
- INSTALL_RPATH ""
- RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc)
-
+# Trick to combine these into a bitcode file via the linker's LTO pass.
+add_executable(libompdevice ${src_files})
+set_target_properties(libompdevice PROPERTIES
+ RUNTIME_OUTPUT_DIRECTORY "${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}"
+ LINKER_LANGUAGE CXX
+ BUILD_RPATH ""
+ INSTALL_RPATH ""
+ RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc)
+
# If the user built with the GPU C library enabled we will use that instead.
if(TARGET libc)
target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
endif()
target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
-
+
target_include_directories(libompdevice PRIVATE
- ${CMAKE_CURRENT_SOURCE_DIR}/include
- ${CMAKE_CURRENT_SOURCE_DIR}/../../libc
- ${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
+ ${CMAKE_CURRENT_SOURCE_DIR}/include
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../libc
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
target_compile_options(libompdevice PRIVATE ${compile_options})
- target_link_options(libompdevice PRIVATE
+ if(NOT "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" AND
+ NOT "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
+ target_link_options(libompdevice PRIVATE
"-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
+ else()
+ target_link_options(libompdevice PRIVATE
+ "-nostdlib" "-emit-llvm" "-Wl")
+ endif()
if(LLVM_DEFAULT_TARGET_TRIPLE)
target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
endif()
install(TARGETS libompdevice
- PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
- DESTINATION ${OPENMP_INSTALL_LIBDIR})
-endif()
+ PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
+ DESTINATION ${OPENMP_INSTALL_LIBDIR})
+
add_library(ompdevice.all_objs OBJECT IMPORTED)
set_property(TARGET ompdevice.all_objs APPEND PROPERTY IMPORTED_OBJECTS
${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}/libomptarget-${target_name}.bc)
diff --git a/openmp/device/include/DeviceTypes.h b/openmp/device/include/DeviceTypes.h
index a3a3d7acd172e..2c68109ca544d 100644
--- a/openmp/device/include/DeviceTypes.h
+++ b/openmp/device/include/DeviceTypes.h
@@ -132,7 +132,7 @@ struct IdentTy {
using __kmpc_impl_lanemask_t = LaneMaskTy;
#ifdef __SPIRV__
-using FnPtrTy = __attribute__((address_space(9))) void *;
+using FnPtrTy = __attribute__((address_space(ProgramAS))) void *;
#else
using FnPtrTy = void *;
#endif
diff --git a/openmp/device/include/LibC.h b/openmp/device/include/LibC.h
index a67323b58f381..6975091b61180 100644
--- a/openmp/device/include/LibC.h
+++ b/openmp/device/include/LibC.h
@@ -16,6 +16,8 @@
namespace ompx {
+// SPIR-V backend does not support variadic functions except for __spirv_ocl_printf
+// This is to provide a workaround to use regular printf that is used in the code.
#if defined(__SPIRV__)
template <size_t N, typename... Args>
int printf(const char (&Format)[N], Args... args) {
diff --git a/openmp/device/src/LibC.cpp b/openmp/device/src/LibC.cpp
index 095d9944531fe..6934387952b7c 100644
--- a/openmp/device/src/LibC.cpp
+++ b/openmp/device/src/LibC.cpp
@@ -48,4 +48,4 @@ namespace ompx {
return ::vprintf(Format, vlist);
}
} // namespace ompx
-#endif
\ No newline at end of file
+#endif
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 385b47e9bf5dd..09edb8dc2d9cc 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -259,24 +259,6 @@ void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
///}
#if defined(__SPIRV__)
-typedef enum {
- CrossDevice = 0,
- Device = 1,
- Workgroup = 2,
- Subgroup = 3,
- Invocation = 4
-} Scope_t;
-typedef enum {
- Relaxed = 0x0,
- Acquire = 0x2,
- Release = 0x4,
- AcquireRelease = 0x8,
- SequentiallyConsistent = 0x10
-} MemorySemantics_t;
-
-extern "C" uint32_t __spirv_AtomicIAdd(uint32_t *, int, int, uint32_t);
-extern "C" void __spirv_MemoryBarrier(int, int);
-extern "C" void __spirv_ControlBarrier(uint32_t, uint32_t, uint32_t);
MemorySemantics_t convertOrderingType(atomic::OrderingTy Ordering) {
switch (Ordering) {
@@ -299,57 +281,52 @@ uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
return __spirv_AtomicIAdd(Address, (int)MemScope,
convertOrderingType(Ordering), Val);
}
-void namedBarrierInit() { __builtin_trap(); } // TODO
-void namedBarrier() { __builtin_trap(); } // TODO
+
+void namedBarrierInit() {} // TODO
+void namedBarrier() {} // TODO
void fenceTeam(atomic::OrderingTy Ordering) {
return __spirv_MemoryBarrier(Scope_t::Workgroup,
- convertOrderingType(Ordering));
+ 0x100 | convertOrderingType(Ordering));
}
void fenceKernel(atomic::OrderingTy Ordering) {
- return __spirv_MemoryBarrier(Scope_t::Invocation,
- convertOrderingType(Ordering));
+ return __spirv_MemoryBarrier(Scope_t::Device,
+ 0x200 | convertOrderingType(Ordering));
}
void fenceSystem(atomic::OrderingTy Ordering) {
- return __spirv_MemoryBarrier(Scope_t::Device, convertOrderingType(Ordering));
+ return __spirv_MemoryBarrier(Scope_t::CrossDevice, 0x200 | convertOrderingType(Ordering));
}
void syncWarp(__kmpc_impl_lanemask_t) {
- __spirv_ControlBarrier(Scope_t::Invocation, Scope_t::Subgroup,
- MemorySemantics_t::Acquire);
+ __spirv_ControlBarrier(Scope_t::Subgroup, Scope_t::Subgroup,
+ 0x80 | MemorySemantics_t::SequentiallyConsistent);
}
void syncThreads(atomic::OrderingTy Ordering) {
- __spirv_ControlBarrier(Scope_t::Invocation, Scope_t::Workgroup,
- MemorySemantics_t::Acquire);
+ __spirv_ControlBarrier(Scope_t::Workgroup, Scope_t::Workgroup,
+ 0x100 | convertOrderingType(Ordering));
}
void unsetLock(omp_lock_t *Lock) {
- atomic::store((int32_t *)Lock, 0, atomic::release);
+ __spirv_AtomicStore((int32_t *)Lock, Scope_t::CrossDevice, 0x200 | MemorySemantics_t::SequentiallyConsistent, 0);
}
int testLock(omp_lock_t *Lock) {
- return atomic::add((int32_t *)Lock, 0, atomic::relaxed);
+ int32_t *lock_ptr = (int32_t *)Lock;
+ return __spirv_AtomicCompareExchange(lock_ptr, Scope_t::CrossDevice,
+ 0x200 | MemorySemantics_t::SequentiallyConsistent,
+ 0x200 | MemorySemantics_t::SequentiallyConsistent, 1, 0);
}
void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
void setLock(omp_lock_t *Lock) {
int32_t *lock_ptr = (int32_t *)Lock;
- bool acquired = false;
- int32_t expected;
- while (!acquired) {
- expected = 0;
- if (expected == atomic::load(lock_ptr, atomic::relaxed))
- acquired =
- atomic::cas(lock_ptr, expected, 1, atomic::acq_rel, atomic::release);
- }
+ while(__spirv_AtomicCompareExchange(lock_ptr, Scope_t::CrossDevice,
+ 0x200 | MemorySemantics_t::SequentiallyConsistent,
+ 0x200 | MemorySemantics_t::SequentiallyConsistent, 1, 0)){}
}
-extern "C" int __attribute__((overloadable)) sub_group_scan_inclusive_min(int);
+
void unsetCriticalLock(omp_lock_t *Lock) {
- int id = mapping::getThreadIdInWarp();
- if (id == sub_group_scan_inclusive_min(id))
- unsetLock(Lock);
+ unsetLock(Lock);
}
void setCriticalLock(omp_lock_t *Lock) {
- int id = mapping::getThreadIdInWarp();
- if (id == sub_group_scan_inclusive_min(id))
- setLock(Lock);
+ setLock(Lock);
}
void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
#endif
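One hedged note on the __spirv_AtomicCompareExchange calls above, since the
operand order is easy to misread: following SPIR-V's OpAtomicCompareExchange
layout, the pointer is followed by the scope, the semantics for the equal and
unequal cases, then Value and Comparator, and the call returns the *original*
contents. The acquire loop spelled out (constants match the file's usage):

  // 0 = CrossDevice scope; 0x200 = CrossWorkgroupMemory; 0x10 = SeqCst.
  static void spin_acquire(int32_t *lock_ptr) {
    while (__spirv_AtomicCompareExchange(lock_ptr,
                                         /*Scope=*/0,
                                         /*Equal=*/0x200 | 0x10,
                                         /*Unequal=*/0x200 | 0x10,
                                         /*Value=*/1,
                                         /*Comparator=*/0) != 0) {
      // Nonzero return means another thread held the lock; spin until we
      // observe 0 and swap in 1 in one atomic step.
    }
  }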
>From 5b6f225403da1f61de4048d640ed3ee3e3f1e757 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Tue, 6 Jan 2026 16:34:48 -0800
Subject: [PATCH 03/11] Fix formatting issues
---
clang/lib/Headers/spirvintrin.h | 33 ++++++++++++++++---------------
openmp/device/include/LibC.h | 2 +-
openmp/device/include/State.h | 2 +-
openmp/device/src/Parallelism.cpp | 2 +-
4 files changed, 20 insertions(+), 19 deletions(-)
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
index 6784e7ecc2794..e66a2bf0767a6 100644
--- a/clang/lib/Headers/spirvintrin.h
+++ b/clang/lib/Headers/spirvintrin.h
@@ -55,8 +55,7 @@ typedef enum {
using unsigned ProgramAS = 9;
#ifdef __cplusplus
-template <typename... Args>
-int __spirv_ocl_printf(Args...);
+template <typename... Args> int __spirv_ocl_printf(Args...);
#endif
// Subgroup
@@ -64,12 +63,16 @@ __SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupLocalInvocationId;
__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupSize;
// Group non-uniform operations
-uint64_t __spirv_GroupNonUniformBallot(uint32_t execution_scope, bool predicate);
-uint32_t __spirv_GroupNonUniformBroadcastFirst(uint32_t execution_scope, uint32_t value);
-uint32_t __spirv_GroupNonUniformShuffle(uint32_t execution_scope, uint32_t value, uint32_t id);
+uint64_t __spirv_GroupNonUniformBallot(uint32_t execution_scope,
+ bool predicate);
+uint32_t __spirv_GroupNonUniformBroadcastFirst(uint32_t execution_scope,
+ uint32_t value);
+uint32_t __spirv_GroupNonUniformShuffle(uint32_t execution_scope,
+ uint32_t value, uint32_t id);
// Synchronization
-void __spirv_ControlBarrier(uint32_t execution_scope, uint32_t memory_scope, uint32_t semantics);
+void __spirv_ControlBarrier(uint32_t execution_scope, uint32_t memory_scope,
+ uint32_t semantics);
void __spirv_MemoryBarrier(uint32_t memory_scope, uint32_t semantics);
// Atomic
@@ -81,17 +84,17 @@ int32_t __spirv_AtomicCompareExchange(int32_t *, int, int, int, int, int);
// Returns the number of blocks in the 'x' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
- return __spirv_BuiltInNumWorkgroups(0);
+ return __spirv_BuiltInNumWorkgroups(0);
}
// Returns the number of blocks in the 'y' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
- return __spirv_BuiltInNumWorkgroups(1);
+ return __spirv_BuiltInNumWorkgroups(1);
}
// Returns the number of blocks in the 'z' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
- return __spirv_BuiltInNumWorkgroups(2);
+ return __spirv_BuiltInNumWorkgroups(2);
}
// Returns the 'x' dimension of the current block's id.
@@ -167,13 +170,13 @@ _DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
}
// Waits for all the threads in the block to converge and issues a fence.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
- __spirv_ControlBarrier(Scope_t::Workgroup, Scope_t::Workgroup,
- 0x100 | MemorySemantics_t::SequentiallyConsistent);
+ __spirv_ControlBarrier(Scope_t::Workgroup, Scope_t::Workgroup,
+ 0x100 | MemorySemantics_t::SequentiallyConsistent);
}
// Waits for all threads in the warp to reconverge for independent scheduling.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
- __spirv_ControlBarrier(Scope_t::Subgroup, Scope_t::Subgroup,
- 0x80 | MemorySemantics_t::SequentiallyConsistent);
+ __spirv_ControlBarrier(Scope_t::Subgroup, Scope_t::Subgroup,
+ 0x80 | MemorySemantics_t::SequentiallyConsistent);
}
// Shuffles the lanes inside the warp according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
@@ -211,12 +214,10 @@ __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
// Returns true if the flat pointer points to 'shared' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
return false; // TODO
- //return to_local(ptr) != 0;
}
// Returns true if the flat pointer points to 'local' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
- return false;
- //return to_private(ptr) != 0; // TODO
+ return false; // TODO
}
// Terminates execution of the calling thread.
_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
diff --git a/openmp/device/include/LibC.h b/openmp/device/include/LibC.h
index 6975091b61180..8881cf46176fd 100644
--- a/openmp/device/include/LibC.h
+++ b/openmp/device/include/LibC.h
@@ -23,7 +23,7 @@ template <size_t N, typename... Args>
int printf(const char (&Format)[N], Args... args) {
return __spirv_ocl_printf(Format, args...);
}
-#else
+#else
int printf(const char *Format, ...);
#endif
diff --git a/openmp/device/include/State.h b/openmp/device/include/State.h
index 338f5a7f8d591..31dc1540d7dd4 100644
--- a/openmp/device/include/State.h
+++ b/openmp/device/include/State.h
@@ -219,7 +219,7 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
__builtin_unreachable();
}
-[[gnu::always_inline, gnu::flatten]] inline FnPtrTy&
+[[gnu::always_inline, gnu::flatten]] inline FnPtrTy &
lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
switch (Kind) {
case state::VK_ParallelRegionFn:
diff --git a/openmp/device/src/Parallelism.cpp b/openmp/device/src/Parallelism.cpp
index d72e31c328ab9..9f74990ce43ea 100644
--- a/openmp/device/src/Parallelism.cpp
+++ b/openmp/device/src/Parallelism.cpp
@@ -261,7 +261,7 @@ __kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
1u, true, ident,
/*ForceTeamState=*/true);
state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
- (FnPtrTy)nullptr, true, ident,
+ (FnPtrTy) nullptr, true, ident,
/*ForceTeamState=*/true);
state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
/*ForceTeamState=*/true);
>From dc78bad4eb849def92bd3c17f6ee4c5f9157bc93 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Thu, 8 Jan 2026 14:40:14 -0800
Subject: [PATCH 04/11] Address PR comments
---
openmp/device/CMakeLists.txt | 55 ++++++++++++++++++------------------
1 file changed, 28 insertions(+), 27 deletions(-)
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
index 0dc43ac034225..e09bc886ae60b 100644
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@@ -47,7 +47,7 @@ endif()
# instructions yet and we end up missing out on way more important constant
# propagation. That said, we will run the vectorizer again after the runtime
# has been linked into the user program.
-list(APPEND compile_options -mllvm -vectorize-slp=false)
+list(APPEND compile_options "SHELL: -mllvm -vectorize-slp=false")
if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn" OR
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn")
set(target_name "amdgpu")
@@ -70,32 +70,33 @@ set_target_properties(libompdevice PROPERTIES
BUILD_RPATH ""
INSTALL_RPATH ""
RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc)
-
- # If the user built with the GPU C library enabled we will use that instead.
- if(TARGET libc)
- target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
- endif()
- target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
-
- target_include_directories(libompdevice PRIVATE
- ${CMAKE_CURRENT_SOURCE_DIR}/include
- ${CMAKE_CURRENT_SOURCE_DIR}/../../libc
- ${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
- target_compile_options(libompdevice PRIVATE ${compile_options})
- if(NOT "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" AND
- NOT "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
- target_link_options(libompdevice PRIVATE
- "-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
- else()
- target_link_options(libompdevice PRIVATE
- "-nostdlib" "-emit-llvm" "-Wl")
- endif()
- if(LLVM_DEFAULT_TARGET_TRIPLE)
- target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
- endif()
- install(TARGETS libompdevice
- PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
- DESTINATION ${OPENMP_INSTALL_LIBDIR})
+
+# If the user built with the GPU C library enabled we will use that instead.
+if(TARGET libc)
+ target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
+endif()
+target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
+
+target_include_directories(libompdevice PRIVATE
+ ${CMAKE_CURRENT_SOURCE_DIR}/include
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../libc
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
+target_compile_options(libompdevice PRIVATE ${compile_options})
+if(NOT "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" AND
+ NOT "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
+ target_link_options(libompdevice PRIVATE
+ "-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
+else()
+ target_link_options(libompdevice PRIVATE
+ "-nostdlib" "-emit-llvm" "-Wl")
+endif()
+
+if(LLVM_DEFAULT_TARGET_TRIPLE)
+ target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
+endif()
+install(TARGETS libompdevice
+ PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
+ DESTINATION ${OPENMP_INSTALL_LIBDIR})
add_library(ompdevice.all_objs OBJECT IMPORTED)
set_property(TARGET ompdevice.all_objs APPEND PROPERTY IMPORTED_OBJECTS
>From e98cf56d4da30aa5e3632401f4267e023e2446f6 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Thu, 8 Jan 2026 18:00:41 -0800
Subject: [PATCH 05/11] Fix formatting
---
clang/lib/Headers/spirvintrin.h | 23 +++++++++-------------
openmp/device/include/LibC.h | 5 +++--
openmp/device/src/Synchronization.cpp | 28 +++++++++++++--------------
3 files changed, 26 insertions(+), 30 deletions(-)
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
index e66a2bf0767a6..eebab0cadc951 100644
--- a/clang/lib/Headers/spirvintrin.h
+++ b/clang/lib/Headers/spirvintrin.h
@@ -22,7 +22,7 @@ _Pragma("omp begin declare variant match(device = {arch(spirv64)})");
// Type aliases to the address spaces used by the SPIR-V backend.
//
-#define __gpu_private __attribute__((address_space(0)))
+#define __gpu_private __attribute__((address_space(0)))
#define __gpu_constant
#define __gpu_local
#define __gpu_global __attribute__((address_space(1)))
@@ -153,7 +153,7 @@ _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
}
// Returns the bit-mask of active threads in the current warp.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
uint32_t Size = __gpu_num_lanes();
return ((uint64_t)1 << Size) - (uint64_t)1;
}
@@ -170,13 +170,13 @@ _DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
}
// Waits for all the threads in the block to converge and issues a fence.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
- __spirv_ControlBarrier(Scope_t::Workgroup, Scope_t::Workgroup,
- 0x100 | MemorySemantics_t::SequentiallyConsistent);
+ __spirv_ControlBarrier(Scope_t::Workgroup, Scope_t::Workgroup,
+ 0x100 | MemorySemantics_t::SequentiallyConsistent);
}
// Waits for all threads in the warp to reconverge for independent scheduling.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
- __spirv_ControlBarrier(Scope_t::Subgroup, Scope_t::Subgroup,
- 0x80 | MemorySemantics_t::SequentiallyConsistent);
+ __spirv_ControlBarrier(Scope_t::Subgroup, Scope_t::Subgroup,
+ 0x80 | MemorySemantics_t::SequentiallyConsistent);
}
// Shuffles the lanes inside the warp according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
@@ -215,16 +215,11 @@ __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
return false; // TODO
}
-// Returns true if the flat pointer points to 'local' memory.
-_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
- return false; // TODO
-}
+
// Terminates execution of the calling thread.
-_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
-}
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {}
// Suspend the thread briefly to assist the scheduler during busy loops.
-_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
-}
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {}
_Pragma("omp end declare variant");
_Pragma("omp end declare target");
diff --git a/openmp/device/include/LibC.h b/openmp/device/include/LibC.h
index 8881cf46176fd..6aa6b5283ec08 100644
--- a/openmp/device/include/LibC.h
+++ b/openmp/device/include/LibC.h
@@ -16,8 +16,9 @@
namespace ompx {
-// SPIR-V backend does not support variadic functions except for __spirv_ocl_printf
-// This is to provide a workaround to use regular printf that is used in the code.
+// SPIR-V backend does not support variadic functions except for
+// __spirv_ocl_printf. This is to provide a workaround to use
+// regular printf that is used in the code.
#if defined(__SPIRV__)
template <size_t N, typename... Args>
int printf(const char (&Format)[N], Args... args) {
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 09edb8dc2d9cc..bb0de88aa28db 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -293,7 +293,8 @@ void fenceKernel(atomic::OrderingTy Ordering) {
0x200 | convertOrderingType(Ordering));
}
void fenceSystem(atomic::OrderingTy Ordering) {
- return __spirv_MemoryBarrier(Scope_t::CrossDevice, 0x200 | convertOrderingType(Ordering));
+ return __spirv_MemoryBarrier(Scope_t::CrossDevice,
+ 0x200 | convertOrderingType(Ordering));
}
void syncWarp(__kmpc_impl_lanemask_t) {
@@ -305,29 +306,28 @@ void syncThreads(atomic::OrderingTy Ordering) {
0x100 | convertOrderingType(Ordering));
}
void unsetLock(omp_lock_t *Lock) {
- __spirv_AtomicStore((int32_t *)Lock, Scope_t::CrossDevice, 0x200 | MemorySemantics_t::SequentiallyConsistent, 0);
+ __spirv_AtomicStore((int32_t *)Lock, Scope_t::CrossDevice,
+ 0x200 | MemorySemantics_t::SequentiallyConsistent, 0);
}
int testLock(omp_lock_t *Lock) {
int32_t *lock_ptr = (int32_t *)Lock;
- return __spirv_AtomicCompareExchange(lock_ptr, Scope_t::CrossDevice,
- 0x200 | MemorySemantics_t::SequentiallyConsistent,
- 0x200 | MemorySemantics_t::SequentiallyConsistent, 1, 0);
+ return __spirv_AtomicCompareExchange(
+ lock_ptr, Scope_t::CrossDevice,
+ 0x200 | MemorySemantics_t::SequentiallyConsistent,
+ 0x200 | MemorySemantics_t::SequentiallyConsistent, 1, 0);
}
void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
void setLock(omp_lock_t *Lock) {
int32_t *lock_ptr = (int32_t *)Lock;
- while(__spirv_AtomicCompareExchange(lock_ptr, Scope_t::CrossDevice,
- 0x200 | MemorySemantics_t::SequentiallyConsistent,
- 0x200 | MemorySemantics_t::SequentiallyConsistent, 1, 0)){}
+ while(__spirv_AtomicCompareExchange(
+ lock_ptr, Scope_t::CrossDevice,
+ 0x200 | MemorySemantics_t::SequentiallyConsistent,
+ 0x200 | MemorySemantics_t::SequentiallyConsistent, 1, 0)){}
}
-void unsetCriticalLock(omp_lock_t *Lock) {
- unsetLock(Lock);
-}
-void setCriticalLock(omp_lock_t *Lock) {
- setLock(Lock);
-}
+void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); }
+void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
#endif
>From 40dfa28ab2138d9d099083300b7c10c630e4bd94 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Thu, 8 Jan 2026 19:21:06 -0800
Subject: [PATCH 06/11] Allow build of DeviceRTL for SPIRV backend
---
clang/lib/Headers/gpuintrin.h | 4 ++--
clang/lib/Headers/spirvintrin.h | 2 +-
openmp/CMakeLists.txt | 4 ++--
3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 8b75cc14878e3..98d707793d13b 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -60,10 +60,10 @@ _Pragma("omp end declare target");
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
#include <amdgpuintrin.h>
-#elif !defined(_OPENMP)
-#error "This header is only meant to be used on GPU architectures."
#elif defined(__SPIRV__)
#include <spirvintrin.h>
+#elif !defined(_OPENMP)
+#error "This header is only meant to be used on GPU architectures."
#endif
_Pragma("omp begin declare target device_type(nohost)");
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
index eebab0cadc951..e1415debaf155 100644
--- a/clang/lib/Headers/spirvintrin.h
+++ b/clang/lib/Headers/spirvintrin.h
@@ -52,7 +52,7 @@ typedef enum {
SequentiallyConsistent = 0x10
} MemorySemantics_t;
-using unsigned ProgramAS = 9;
+constexpr uint32_t ProgramAS = 9;
#ifdef __cplusplus
template <typename... Args> int __spirv_ocl_printf(Args...);
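The ProgramAS change fixes ill-formed C++ rather than style: a using declaration introduces a type alias and cannot bind a value, so a named compile-time integer wants constexpr. A short illustration of the distinction (not from the patch):

    #include <stdint.h>

    using Index = uint32_t;            // type alias: names a type, OK
    constexpr uint32_t ProgramAS = 9;  // named compile-time value, OK
    // using unsigned ProgramAS = 9;   // ill-formed: '9' is not a type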
diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt
index df568419824a6..355c64d28d296 100644
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@@ -158,8 +158,8 @@ else()
endif()
# Use the current compiler target to determine the appropriate runtime to build.
-if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn|^nvptx" OR
- "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn|^nvptx")
+if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn|^nvptx|^spirv64" OR
+ "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn|^nvptx|^spirv64")
add_subdirectory(device)
else()
add_subdirectory(module)
>From 2922a82845fc1b5937188499f6fd7e88c6eafd16 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Thu, 8 Jan 2026 19:27:36 -0800
Subject: [PATCH 07/11] Fix format
---
clang/lib/Headers/spirvintrin.h | 4 +---
openmp/device/include/LibC.h | 2 +-
openmp/device/src/Synchronization.cpp | 11 ++++++-----
3 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
index e1415debaf155..1e242c809a800 100644
--- a/clang/lib/Headers/spirvintrin.h
+++ b/clang/lib/Headers/spirvintrin.h
@@ -81,7 +81,6 @@ void __spirv_AtomicStore(int32_t *, int, int, int);
int32_t __spirv_AtomicLoad(int32_t *, int, int);
int32_t __spirv_AtomicCompareExchange(int32_t *, int, int, int, int, int);
-
// Returns the number of blocks in the 'x' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
return __spirv_BuiltInNumWorkgroups(0);
@@ -151,7 +150,7 @@ _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
return __spirv_BuiltInSubgroupLocalInvocationId;
}
-
+
// Returns the bit-mask of active threads in the current warp.
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
uint32_t Size = __gpu_num_lanes();
@@ -204,7 +203,6 @@ __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
return __gpu_match_all_u32_impl(__lane_mask, __x);
}
-
// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
diff --git a/openmp/device/include/LibC.h b/openmp/device/include/LibC.h
index 6aa6b5283ec08..687561bd49461 100644
--- a/openmp/device/include/LibC.h
+++ b/openmp/device/include/LibC.h
@@ -17,7 +17,7 @@
namespace ompx {
// SPIR-V backend does not support variadic functions except for
-// __spirv_ocl_printf. This is to provide a workaround to use
-// regular printf that is used in the code.
+// __spirv_ocl_printf. This provides a workaround so the regular
+// printf used in the code keeps working.
#if defined(__SPIRV__)
template <size_t N, typename... Args>
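The comment in this hunk records the key constraint: on SPIR-V targets printf cannot be declared as an ordinary variadic function, so calls are funneled through the __spirv_ocl_printf template that spirvintrin.h declares. A hypothetical sketch of such a forwarding wrapper, assuming that declaration; the real definition lives in openmp/device/src/LibC.cpp and may differ:

    #include <stddef.h>

    // Assumed declaration, mirroring spirvintrin.h.
    template <typename... Args> int __spirv_ocl_printf(Args...);

    // Hypothetical wrapper: take the format string as a fixed-size
    // array (no variadic C linkage needed) and forward everything.
    template <size_t N, typename... Args>
    static int printf(const char (&Format)[N], Args... Arguments) {
      return __spirv_ocl_printf(Format, Arguments...);
    }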
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index bb0de88aa28db..74290023f7a5f 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -312,8 +312,8 @@ void unsetLock(omp_lock_t *Lock) {
int testLock(omp_lock_t *Lock) {
int32_t *lock_ptr = (int32_t *)Lock;
return __spirv_AtomicCompareExchange(
- lock_ptr, Scope_t::CrossDevice,
- 0x200 | MemorySemantics_t::SequentiallyConsistent,
+ lock_ptr, Scope_t::CrossDevice,
+ 0x200 | MemorySemantics_t::SequentiallyConsistent,
0x200 | MemorySemantics_t::SequentiallyConsistent, 1, 0);
}
void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
@@ -321,9 +321,10 @@ void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
void setLock(omp_lock_t *Lock) {
int32_t *lock_ptr = (int32_t *)Lock;
while(__spirv_AtomicCompareExchange(
- lock_ptr, Scope_t::CrossDevice,
- 0x200 | MemorySemantics_t::SequentiallyConsistent,
- 0x200 | MemorySemantics_t::SequentiallyConsistent, 1, 0)){}
+ lock_ptr, Scope_t::CrossDevice,
+ 0x200 | MemorySemantics_t::SequentiallyConsistent,
+ 0x200 | MemorySemantics_t::SequentiallyConsistent, 1, 0)){
+ }
}
void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); }
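One non-obvious detail these hunks keep repeating is the 0x200 | prefix on the semantics operand. SPIR-V memory semantics form a bitmask that combines an ordering with the storage classes the operation covers; assuming the standard SPIR-V encoding, 0x100 is WorkgroupMemory and 0x200 is CrossWorkgroupMemory, so the lock code requests sequentially consistent ordering over global memory:

    // Illustrative only, assuming the standard SPIR-V memory-semantics
    // encoding used by __spirv_MemoryBarrier and the atomic intrinsics.
    constexpr int SequentiallyConsistent = 0x10;  // ordering bits
    constexpr int WorkgroupMemory        = 0x100; // covers local memory
    constexpr int CrossWorkgroupMemory   = 0x200; // covers global memory

    constexpr int LockSemantics =
        CrossWorkgroupMemory | SequentiallyConsistent; // == 0x210
    static_assert(LockSemantics == 0x210, "0x200 | 0x10");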
>From a18c3396bf2864a53081c7cea41e3f697ad21409 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Thu, 8 Jan 2026 19:33:14 -0800
Subject: [PATCH 08/11] Fix formatting
---
openmp/device/src/Synchronization.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 74290023f7a5f..ee6e71668d563 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -323,7 +323,7 @@ void setLock(omp_lock_t *Lock) {
while(__spirv_AtomicCompareExchange(
lock_ptr, Scope_t::CrossDevice,
0x200 | MemorySemantics_t::SequentiallyConsistent,
- 0x200 | MemorySemantics_t::SequentiallyConsistent, 1, 0)){
+ 0x200 | MemorySemantics_t::SequentiallyConsistent, 1, 0)) {
}
}
>From ae099bad72be1be53cc517bbfc88c96cb355f321 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Thu, 8 Jan 2026 19:40:57 -0800
Subject: [PATCH 09/11] Fix format
---
openmp/device/src/Synchronization.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index ee6e71668d563..384e1a13f1c96 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -320,7 +320,7 @@ void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
void setLock(omp_lock_t *Lock) {
int32_t *lock_ptr = (int32_t *)Lock;
- while(__spirv_AtomicCompareExchange(
+ while (__spirv_AtomicCompareExchange(
lock_ptr, Scope_t::CrossDevice,
0x200 | MemorySemantics_t::SequentiallyConsistent,
0x200 | MemorySemantics_t::SequentiallyConsistent, 1, 0)) {
>From 802a65eee300945ecad5dadee6971c8cb6dbe7a3 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Fri, 9 Jan 2026 09:54:10 -0800
Subject: [PATCH 10/11] Add TODO comment
---
clang/lib/Headers/spirvintrin.h | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
index e4284d818e9e4..46196b76b7a47 100644
--- a/clang/lib/Headers/spirvintrin.h
+++ b/clang/lib/Headers/spirvintrin.h
@@ -21,9 +21,10 @@ _Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {arch(spirv64)})");
// Type aliases to the address spaces used by the SPIR-V backend.
+// TODO: Uncomment once the SPIR-V backend can handle address space conversions
#define __gpu_private __attribute__((address_space(0)))
-#define __gpu_constant __attribute__((address_space(2)))
-#define __gpu_local __attribute__((address_space(3)))
+#define __gpu_constant /*__attribute__((address_space(2)))*/
+#define __gpu_local /*__attribute__((address_space(3)))*/
#define __gpu_global __attribute__((address_space(1)))
#define __gpu_generic __attribute__((address_space(4)))
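The TODO explains why __gpu_constant and __gpu_local are left empty for now: once those qualifiers carry real address spaces, ordinary pointer assignments need backend-inserted address-space conversions that the SPIR-V backend cannot yet produce. A hypothetical illustration of code that would require such a conversion:

    // Hypothetical illustration, not from the patch: with
    // address_space(3) applied, passing Scratch to a function that takes
    // a generic pointer needs an addrspace(3) -> generic conversion.
    #define LOCAL __attribute__((address_space(3)))

    LOCAL int Scratch[32];  // workgroup-local storage
    void use(int *Generic); // expects a generic pointer
    // use(Scratch);        // would require the missing AS conversion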
>From f2dbfa2c8134f742819ce96ef1a1f69f13c641bd Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Fri, 9 Jan 2026 09:57:47 -0800
Subject: [PATCH 11/11] Fix format
---
clang/lib/Headers/spirvintrin.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
index 46196b76b7a47..23b12d025ebf9 100644
--- a/clang/lib/Headers/spirvintrin.h
+++ b/clang/lib/Headers/spirvintrin.h
@@ -24,7 +24,7 @@ _Pragma("omp begin declare variant match(device = {arch(spirv64)})");
// TODO: Uncomment once the SPIR-V backend can handle address space conversions
#define __gpu_private __attribute__((address_space(0)))
#define __gpu_constant /*__attribute__((address_space(2)))*/
-#define __gpu_local /*__attribute__((address_space(3)))*/
+#define __gpu_local /*__attribute__((address_space(3)))*/
#define __gpu_global __attribute__((address_space(1)))
#define __gpu_generic __attribute__((address_space(4)))
@@ -64,7 +64,6 @@ void __spirv_AtomicStore(int32_t *, int, int, int);
int32_t __spirv_AtomicLoad(int32_t *, int, int);
int32_t __spirv_AtomicCompareExchange(int32_t *, int, int, int, int, int);
-
// Returns the number of workgroups in the 'x' dimension of the grid.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
return __builtin_spirv_num_workgroups(0);
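With the final rename, the grid queries bottom out in SPIR-V builtins such as __builtin_spirv_num_workgroups, while user code keeps going through the portable __gpu_* layer. A small device-side sketch of the usual flat-index computation, assuming the standard gpuintrin.h query names:

    #include <gpuintrin.h>

    // Sketch: flat global thread id in the x dimension, built from the
    // portable gpuintrin.h queries this patch wires up for SPIR-V.
    _DEFAULT_FN_ATTRS static inline uint32_t flat_id_x(void) {
      return __gpu_block_id_x() * __gpu_num_threads_x() +
             __gpu_thread_id_x();
    }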