[libclc] Move mem_fence and barrier to clc library (PR #151446)
Wenju He via cfe-commits
cfe-commits at lists.llvm.org
Tue Aug 5 03:54:06 PDT 2025
https://github.com/wenju-he updated https://github.com/llvm/llvm-project/pull/151446
>From eed56d228c0613f563c23f9be23d681ef3d87f2b Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Thu, 31 Jul 2025 05:07:23 +0200
Subject: [PATCH 1/3] [libclc] Move mem_fence and barrier to clc library
The __clc_mem_fence and __clc_barrier functions take two parameters, Scope and
MemorySemantics, which are defined in the SPIR-V spec. This design allows the
clc functions to implement the SPIR-V ControlBarrier and MemoryBarrier
functions in the future.
The default memory ordering in clc is set to SequentiallyConsistent,
which is also the default and strongest ordering in OpenCL and C++.
The default memory scope in clc is set to memory_scope_device for amdgcn
and ptx-nvidiacl, since the __opencl_c_atomic_scope_all_devices feature macro
is not defined for these targets.
llvm-diff shows no change to amdgcn--amdhsa.bc and nvptx64--nvidiacl.bc.
---
.../clc/include/clc/mem_fence/clc_mem_fence.h | 18 +++++++
.../clc/mem_fence/clc_mem_scope_semantics.h | 36 +++++++++++++
.../include/clc/synchronization/clc_barrier.h | 18 +++++++
libclc/clc/lib/amdgcn/SOURCES | 2 +
.../clc/lib/amdgcn/mem_fence/clc_mem_fence.cl | 37 ++++++++++++++
.../lib/amdgcn/synchronization/clc_barrier.cl | 16 ++++++
libclc/clc/lib/ptx-nvidiacl/SOURCES | 2 +
.../ptx-nvidiacl/mem_fence/clc_mem_fence.cl | 15 ++++++
.../synchronization/clc_barrier.cl | 14 ++++++
.../synchronization/cl_mem_fence_flags.h | 27 ++++++++++
.../clc/opencl/synchronization/utils.h | 50 +++++++++++++++++++
libclc/opencl/lib/amdgcn/mem_fence/fence.cl | 29 ++---------
.../lib/amdgcn/synchronization/barrier.cl | 8 +--
.../lib/ptx-nvidiacl/mem_fence/fence.cl | 7 ++-
.../ptx-nvidiacl/synchronization/barrier.cl | 6 ++-
15 files changed, 255 insertions(+), 30 deletions(-)
create mode 100644 libclc/clc/include/clc/mem_fence/clc_mem_fence.h
create mode 100644 libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h
create mode 100644 libclc/clc/include/clc/synchronization/clc_barrier.h
create mode 100644 libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl
create mode 100644 libclc/clc/lib/amdgcn/synchronization/clc_barrier.cl
create mode 100644 libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl
create mode 100644 libclc/clc/lib/ptx-nvidiacl/synchronization/clc_barrier.cl
create mode 100644 libclc/opencl/include/clc/opencl/synchronization/utils.h
diff --git a/libclc/clc/include/clc/mem_fence/clc_mem_fence.h b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h
new file mode 100644
index 0000000000000..f0bbd136955bd
--- /dev/null
+++ b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h
@@ -0,0 +1,18 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MEM_FENCE_CLC_MEM_FENCE_H__
+#define __CLC_MEM_FENCE_CLC_MEM_FENCE_H__
+
+#include <clc/internal/clc.h>
+#include <clc/mem_fence/clc_mem_scope_semantics.h>
+
+_CLC_OVERLOAD _CLC_DECL void __clc_mem_fence(Scope scope,
+ MemorySemantics semantics);
+
+#endif // __CLC_MEM_FENCE_CLC_MEM_FENCE_H__
diff --git a/libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h b/libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h
new file mode 100644
index 0000000000000..7294026386b7a
--- /dev/null
+++ b/libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MEM_FENCE_CLC_MEM_SCOPE_SEMANTICS_H__
+#define __CLC_MEM_FENCE_CLC_MEM_SCOPE_SEMANTICS_H__
+
+// Scope values are defined in SPIR-V spec.
+typedef enum Scope {
+ CrossDevice = 0,
+ Device = 1,
+ Workgroup = 2,
+ Subgroup = 3,
+ Invocation = 4,
+} Scope;
+
+// MemorySemantics values are defined in SPIR-V spec.
+typedef enum MemorySemantics {
+ None = 0x0,
+ Acquire = 0x2,
+ Release = 0x4,
+ AcquireRelease = 0x8,
+ SequentiallyConsistent = 0x10,
+ UniformMemory = 0x40,
+ SubgroupMemory = 0x80,
+ WorkgroupMemory = 0x100,
+ CrossWorkgroupMemory = 0x200,
+ AtomicCounterMemory = 0x400,
+ ImageMemory = 0x800,
+} MemorySemantics;
+
+#endif // __CLC_MEM_FENCE_CLC_MEM_SCOPE_SEMANTICS_H__
diff --git a/libclc/clc/include/clc/synchronization/clc_barrier.h b/libclc/clc/include/clc/synchronization/clc_barrier.h
new file mode 100644
index 0000000000000..d363652c6e14d
--- /dev/null
+++ b/libclc/clc/include/clc/synchronization/clc_barrier.h
@@ -0,0 +1,18 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_SYNCHRONIZATION_CLC_BARRIER_H__
+#define __CLC_SYNCHRONIZATION_CLC_BARRIER_H__
+
+#include <clc/internal/clc.h>
+#include <clc/mem_fence/clc_mem_scope_semantics.h>
+
+_CLC_OVERLOAD _CLC_DECL void __clc_barrier(Scope scope,
+ MemorySemantics semantics);
+
+#endif // __CLC_SYNCHRONIZATION_CLC_BARRIER_H__
diff --git a/libclc/clc/lib/amdgcn/SOURCES b/libclc/clc/lib/amdgcn/SOURCES
index 7bec1740f7636..f2f58e3124aa8 100644
--- a/libclc/clc/lib/amdgcn/SOURCES
+++ b/libclc/clc/lib/amdgcn/SOURCES
@@ -1,6 +1,8 @@
math/clc_fmax.cl
math/clc_fmin.cl
math/clc_ldexp_override.cl
+mem_fence/clc_mem_fence.cl
+synchronization/clc_barrier.cl
workitem/clc_get_global_offset.cl
workitem/clc_get_global_size.cl
workitem/clc_get_group_id.cl
diff --git a/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl
new file mode 100644
index 0000000000000..12ec6d8d18091
--- /dev/null
+++ b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/mem_fence/clc_mem_fence.h>
+
+void __clc_amdgcn_s_waitcnt(unsigned flags);
+
+// s_waitcnt takes 16bit argument with a combined number of maximum allowed
+// pending operations:
+// [12:8] LGKM -- LDS, GDS, Konstant (SMRD), Messages
+// [7] -- undefined
+// [6:4] -- exports, GDS, and mem write
+// [3:0] -- vector memory operations
+
+// Newer clang supports __builtin_amdgcn_s_waitcnt
+#if __clang_major__ >= 5
+#define __waitcnt(x) __builtin_amdgcn_s_waitcnt(x)
+#else
+#define __waitcnt(x) __clc_amdgcn_s_waitcnt(x)
+_CLC_DEF void __clc_amdgcn_s_waitcnt(unsigned) __asm("llvm.amdgcn.s.waitcnt");
+#endif
+
+_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(Scope scope,
+ MemorySemantics semantics) {
+ if (semantics & CrossWorkgroupMemory) {
+ // scalar loads are counted with LGKM but we don't know whether
+ // the compiler turned any loads to scalar
+ __waitcnt(0);
+ } else if (semantics & WorkgroupMemory)
+ __waitcnt(0xff); // LGKM is [12:8]
+}
+#undef __waitcnt
diff --git a/libclc/clc/lib/amdgcn/synchronization/clc_barrier.cl b/libclc/clc/lib/amdgcn/synchronization/clc_barrier.cl
new file mode 100644
index 0000000000000..0299a426e4d21
--- /dev/null
+++ b/libclc/clc/lib/amdgcn/synchronization/clc_barrier.cl
@@ -0,0 +1,16 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/mem_fence/clc_mem_fence.h>
+#include <clc/synchronization/clc_barrier.h>
+
+_CLC_OVERLOAD _CLC_DEF void __clc_barrier(Scope scope,
+ MemorySemantics semantics) {
+ __clc_mem_fence(scope, semantics);
+ __builtin_amdgcn_s_barrier();
+}
diff --git a/libclc/clc/lib/ptx-nvidiacl/SOURCES b/libclc/clc/lib/ptx-nvidiacl/SOURCES
index 05368c5e4d4e3..a0fb861549ebc 100644
--- a/libclc/clc/lib/ptx-nvidiacl/SOURCES
+++ b/libclc/clc/lib/ptx-nvidiacl/SOURCES
@@ -1,3 +1,5 @@
+mem_fence/clc_mem_fence.cl
+synchronization/clc_barrier.cl
workitem/clc_get_global_id.cl
workitem/clc_get_group_id.cl
workitem/clc_get_local_id.cl
diff --git a/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl
new file mode 100644
index 0000000000000..4c0d342b7244f
--- /dev/null
+++ b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl
@@ -0,0 +1,15 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/mem_fence/clc_mem_fence.h>
+
+_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(Scope scope,
+ MemorySemantics semantics) {
+ if (semantics & (CrossWorkgroupMemory | WorkgroupMemory))
+ __nvvm_membar_cta();
+}
diff --git a/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_barrier.cl b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_barrier.cl
new file mode 100644
index 0000000000000..920b17cb02f92
--- /dev/null
+++ b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_barrier.cl
@@ -0,0 +1,14 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/synchronization/clc_barrier.h>
+
+_CLC_OVERLOAD _CLC_DEF void __clc_barrier(Scope scope,
+ MemorySemantics semantics) {
+ __syncthreads();
+}
diff --git a/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h b/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h
index 6636515fca47d..18f9a4afb2d5f 100644
--- a/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h
+++ b/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h
@@ -11,7 +11,34 @@
typedef uint cl_mem_fence_flags;
+// Copied from
+// https://github.com/llvm/llvm-project/blob/08e40c12fa0c/clang/lib/Headers/opencl-c-base.h#L390
+typedef enum memory_scope {
+ memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+ memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+ memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+#if defined(__opencl_c_atomic_scope_all_devices)
+ memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100)
+ memory_scope_all_devices = memory_scope_all_svm_devices,
+#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >=
+ // 202100)
+#endif // defined(__opencl_c_atomic_scope_all_devices)
+/**
+ * Subgroups have different requirements on forward progress, so just test
+ * all the relevant macros.
+ * CL 3.0 sub-groups "they are not guaranteed to make independent forward
+ * progress" KHR subgroups "Subgroups within a workgroup are independent, make
+ * forward progress with respect to each other"
+ */
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || \
+ defined(__opencl_c_subgroups)
+ memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+#endif
+} memory_scope;
+
#define CLK_LOCAL_MEM_FENCE 1
#define CLK_GLOBAL_MEM_FENCE 2
+#define CLK_IMAGE_MEM_FENCE 4
#endif // __CLC_OPENCL_SYNCHRONIZATION_CL_MEM_FENCE_FLAGS_H__
diff --git a/libclc/opencl/include/clc/opencl/synchronization/utils.h b/libclc/opencl/include/clc/opencl/synchronization/utils.h
new file mode 100644
index 0000000000000..098d96d0a8a32
--- /dev/null
+++ b/libclc/opencl/include/clc/opencl/synchronization/utils.h
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__
+#define __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__
+
+#include <clc/internal/clc.h>
+#include <clc/mem_fence/clc_mem_scope_semantics.h>
+#include <clc/opencl/synchronization/cl_mem_fence_flags.h>
+
+_CLC_INLINE Scope getCLCScope(memory_scope memory_scope) {
+ switch (memory_scope) {
+ case memory_scope_work_item:
+ return Invocation;
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || \
+ defined(__opencl_c_subgroups)
+ case memory_scope_sub_group:
+ return Subgroup;
+#endif
+ case memory_scope_work_group:
+ return Workgroup;
+ case memory_scope_device:
+ return Device;
+ default:
+ break;
+ }
+#ifdef __opencl_c_atomic_scope_all_devices
+ return CrossDevice;
+#else
+ return Device;
+#endif
+}
+
+_CLC_INLINE MemorySemantics getCLCMemorySemantics(cl_mem_fence_flags flag) {
+ MemorySemantics semantics = AcquireRelease;
+ if (flag & CLK_GLOBAL_MEM_FENCE)
+ semantics |= CrossWorkgroupMemory;
+ if (flag & CLK_LOCAL_MEM_FENCE)
+ semantics |= WorkgroupMemory;
+ if (flag & CLK_IMAGE_MEM_FENCE)
+ semantics |= ImageMemory;
+ return semantics;
+}
+
+#endif // __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__
diff --git a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl
index 88b953005aae6..10d879d835c06 100644
--- a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl
+++ b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl
@@ -6,34 +6,15 @@
//
//===----------------------------------------------------------------------===//
+#include <clc/mem_fence/clc_mem_fence.h>
#include <clc/opencl/explicit_fence/explicit_memory_fence.h>
-
-void __clc_amdgcn_s_waitcnt(unsigned flags);
-
-// s_waitcnt takes 16bit argument with a combined number of maximum allowed
-// pending operations:
-// [12:8] LGKM -- LDS, GDS, Konstant (SMRD), Messages
-// [7] -- undefined
-// [6:4] -- exports, GDS, and mem write
-// [3:0] -- vector memory operations
-
-// Newer clang supports __builtin_amdgcn_s_waitcnt
-#if __clang_major__ >= 5
-#define __waitcnt(x) __builtin_amdgcn_s_waitcnt(x)
-#else
-#define __waitcnt(x) __clc_amdgcn_s_waitcnt(x)
-_CLC_DEF void __clc_amdgcn_s_waitcnt(unsigned) __asm("llvm.amdgcn.s.waitcnt");
-#endif
+#include <clc/opencl/synchronization/utils.h>
_CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) {
- if (flags & CLK_GLOBAL_MEM_FENCE) {
- // scalar loads are counted with LGKM but we don't know whether
- // the compiler turned any loads to scalar
- __waitcnt(0);
- } else if (flags & CLK_LOCAL_MEM_FENCE)
- __waitcnt(0xff); // LGKM is [12:8]
+ Scope scope = getCLCScope(memory_scope_device);
+ MemorySemantics semantics = getCLCMemorySemantics(flags);
+ __clc_mem_fence(scope, semantics);
}
-#undef __waitcnt
// We don't have separate mechanism for read and write fences
_CLC_DEF _CLC_OVERLOAD void read_mem_fence(cl_mem_fence_flags flags) {
diff --git a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl
index 5203db72f484c..b8372d4800bf1 100644
--- a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl
+++ b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl
@@ -6,10 +6,12 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/opencl/explicit_fence/explicit_memory_fence.h>
#include <clc/opencl/synchronization/barrier.h>
+#include <clc/opencl/synchronization/utils.h>
+#include <clc/synchronization/clc_barrier.h>
_CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) {
- mem_fence(flags);
- __builtin_amdgcn_s_barrier();
+ Scope scope = getCLCScope(memory_scope_device);
+ MemorySemantics semantics = getCLCMemorySemantics(flags);
+ __clc_barrier(scope, semantics);
}
diff --git a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl
index d24569ecda1bc..2d591c90d63c2 100644
--- a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl
+++ b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl
@@ -6,11 +6,14 @@
//
//===----------------------------------------------------------------------===//
+#include <clc/mem_fence/clc_mem_fence.h>
#include <clc/opencl/explicit_fence/explicit_memory_fence.h>
+#include <clc/opencl/synchronization/utils.h>
_CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) {
- if (flags & (CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE))
- __nvvm_membar_cta();
+ Scope scope = getCLCScope(memory_scope_device);
+ MemorySemantics semantics = getCLCMemorySemantics(flags);
+ __clc_mem_fence(scope, semantics);
}
// We do not have separate mechanism for read and write fences.
diff --git a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl
index 7c57478795dda..b8372d4800bf1 100644
--- a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl
+++ b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl
@@ -7,7 +7,11 @@
//===----------------------------------------------------------------------===//
#include <clc/opencl/synchronization/barrier.h>
+#include <clc/opencl/synchronization/utils.h>
+#include <clc/synchronization/clc_barrier.h>
_CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) {
- __syncthreads();
+ Scope scope = getCLCScope(memory_scope_device);
+ MemorySemantics semantics = getCLCMemorySemantics(flags);
+ __clc_barrier(scope, semantics);
}
>From 29ec1763d807d66c5fd3ec19e0ef311e520026e7 Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Thu, 31 Jul 2025 07:17:11 +0200
Subject: [PATCH 2/3] default to SequentiallyConsistent
---
libclc/opencl/include/clc/opencl/synchronization/utils.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libclc/opencl/include/clc/opencl/synchronization/utils.h b/libclc/opencl/include/clc/opencl/synchronization/utils.h
index 098d96d0a8a32..bbcfa20a556d5 100644
--- a/libclc/opencl/include/clc/opencl/synchronization/utils.h
+++ b/libclc/opencl/include/clc/opencl/synchronization/utils.h
@@ -37,7 +37,7 @@ _CLC_INLINE Scope getCLCScope(memory_scope memory_scope) {
}
_CLC_INLINE MemorySemantics getCLCMemorySemantics(cl_mem_fence_flags flag) {
- MemorySemantics semantics = AcquireRelease;
+ MemorySemantics semantics = SequentiallyConsistent;
if (flag & CLK_GLOBAL_MEM_FENCE)
semantics |= CrossWorkgroupMemory;
if (flag & CLK_LOCAL_MEM_FENCE)
>From 86ca62c88512b55cfd772e762e729f52907ee102 Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Tue, 5 Aug 2025 12:45:42 +0200
Subject: [PATCH 3/3] rename clc_barrier to clc_work_group_barrier, replace
SPIR-V constants with clang macros
---
.../clc/include/clc/mem_fence/clc_mem_fence.h | 5 ++-
.../clc/mem_fence/clc_mem_scope_semantics.h | 36 -------------------
...clc_barrier.h => clc_work_group_barrier.h} | 11 +++---
libclc/clc/lib/amdgcn/SOURCES | 2 +-
.../clc/lib/amdgcn/mem_fence/clc_mem_fence.cl | 8 ++---
...c_barrier.cl => clc_work_group_barrier.cl} | 8 ++---
libclc/clc/lib/ptx-nvidiacl/SOURCES | 2 +-
.../ptx-nvidiacl/mem_fence/clc_mem_fence.cl | 6 ++--
...c_barrier.cl => clc_work_group_barrier.cl} | 6 ++--
.../synchronization/cl_mem_fence_flags.h | 26 --------------
.../clc/opencl/synchronization/utils.h | 36 +++----------------
libclc/opencl/lib/amdgcn/mem_fence/fence.cl | 6 ++--
.../lib/amdgcn/synchronization/barrier.cl | 8 ++---
.../lib/ptx-nvidiacl/mem_fence/fence.cl | 6 ++--
.../ptx-nvidiacl/synchronization/barrier.cl | 8 ++---
15 files changed, 42 insertions(+), 132 deletions(-)
delete mode 100644 libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h
rename libclc/clc/include/clc/synchronization/{clc_barrier.h => clc_work_group_barrier.h} (55%)
rename libclc/clc/lib/amdgcn/synchronization/{clc_barrier.cl => clc_work_group_barrier.cl} (64%)
rename libclc/clc/lib/ptx-nvidiacl/synchronization/{clc_barrier.cl => clc_work_group_barrier.cl} (66%)
diff --git a/libclc/clc/include/clc/mem_fence/clc_mem_fence.h b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h
index f0bbd136955bd..2321634c76842 100644
--- a/libclc/clc/include/clc/mem_fence/clc_mem_fence.h
+++ b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h
@@ -10,9 +10,8 @@
#define __CLC_MEM_FENCE_CLC_MEM_FENCE_H__
#include <clc/internal/clc.h>
-#include <clc/mem_fence/clc_mem_scope_semantics.h>
-_CLC_OVERLOAD _CLC_DECL void __clc_mem_fence(Scope scope,
- MemorySemantics semantics);
+_CLC_OVERLOAD _CLC_DECL void __clc_mem_fence(int memory_scope,
+ int memory_order);
#endif // __CLC_MEM_FENCE_CLC_MEM_FENCE_H__
diff --git a/libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h b/libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h
deleted file mode 100644
index 7294026386b7a..0000000000000
--- a/libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __CLC_MEM_FENCE_CLC_MEM_SCOPE_SEMANTICS_H__
-#define __CLC_MEM_FENCE_CLC_MEM_SCOPE_SEMANTICS_H__
-
-// Scope values are defined in SPIR-V spec.
-typedef enum Scope {
- CrossDevice = 0,
- Device = 1,
- Workgroup = 2,
- Subgroup = 3,
- Invocation = 4,
-} Scope;
-
-// MemorySemantics values are defined in SPIR-V spec.
-typedef enum MemorySemantics {
- None = 0x0,
- Acquire = 0x2,
- Release = 0x4,
- AcquireRelease = 0x8,
- SequentiallyConsistent = 0x10,
- UniformMemory = 0x40,
- SubgroupMemory = 0x80,
- WorkgroupMemory = 0x100,
- CrossWorkgroupMemory = 0x200,
- AtomicCounterMemory = 0x400,
- ImageMemory = 0x800,
-} MemorySemantics;
-
-#endif // __CLC_MEM_FENCE_CLC_MEM_SCOPE_SEMANTICS_H__
diff --git a/libclc/clc/include/clc/synchronization/clc_barrier.h b/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h
similarity index 55%
rename from libclc/clc/include/clc/synchronization/clc_barrier.h
rename to libclc/clc/include/clc/synchronization/clc_work_group_barrier.h
index d363652c6e14d..5f864e1057b8b 100644
--- a/libclc/clc/include/clc/synchronization/clc_barrier.h
+++ b/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h
@@ -6,13 +6,12 @@
//
//===----------------------------------------------------------------------===//
-#ifndef __CLC_SYNCHRONIZATION_CLC_BARRIER_H__
-#define __CLC_SYNCHRONIZATION_CLC_BARRIER_H__
+#ifndef __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__
+#define __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__
#include <clc/internal/clc.h>
-#include <clc/mem_fence/clc_mem_scope_semantics.h>
-_CLC_OVERLOAD _CLC_DECL void __clc_barrier(Scope scope,
- MemorySemantics semantics);
+_CLC_OVERLOAD _CLC_DECL void __clc_work_group_barrier(int memory_scope,
+ int memory_order);
-#endif // __CLC_SYNCHRONIZATION_CLC_BARRIER_H__
+#endif // __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__
diff --git a/libclc/clc/lib/amdgcn/SOURCES b/libclc/clc/lib/amdgcn/SOURCES
index f2f58e3124aa8..b20d3db50c416 100644
--- a/libclc/clc/lib/amdgcn/SOURCES
+++ b/libclc/clc/lib/amdgcn/SOURCES
@@ -2,7 +2,7 @@ math/clc_fmax.cl
math/clc_fmin.cl
math/clc_ldexp_override.cl
mem_fence/clc_mem_fence.cl
-synchronization/clc_barrier.cl
+synchronization/clc_work_group_barrier.cl
workitem/clc_get_global_offset.cl
workitem/clc_get_global_size.cl
workitem/clc_get_group_id.cl
diff --git a/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl
index 12ec6d8d18091..9e6460313718e 100644
--- a/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl
+++ b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl
@@ -25,13 +25,13 @@ void __clc_amdgcn_s_waitcnt(unsigned flags);
_CLC_DEF void __clc_amdgcn_s_waitcnt(unsigned) __asm("llvm.amdgcn.s.waitcnt");
#endif
-_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(Scope scope,
- MemorySemantics semantics) {
- if (semantics & CrossWorkgroupMemory) {
+_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope,
+ int memory_order) {
+ if (memory_scope & __MEMORY_SCOPE_DEVICE) {
// scalar loads are counted with LGKM but we don't know whether
// the compiler turned any loads to scalar
__waitcnt(0);
- } else if (semantics & WorkgroupMemory)
+ } else if (memory_scope & __MEMORY_SCOPE_WRKGRP)
__waitcnt(0xff); // LGKM is [12:8]
}
#undef __waitcnt
diff --git a/libclc/clc/lib/amdgcn/synchronization/clc_barrier.cl b/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl
similarity index 64%
rename from libclc/clc/lib/amdgcn/synchronization/clc_barrier.cl
rename to libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl
index 0299a426e4d21..ff3628fa7c339 100644
--- a/libclc/clc/lib/amdgcn/synchronization/clc_barrier.cl
+++ b/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl
@@ -7,10 +7,10 @@
//===----------------------------------------------------------------------===//
#include <clc/mem_fence/clc_mem_fence.h>
-#include <clc/synchronization/clc_barrier.h>
+#include <clc/synchronization/clc_work_group_barrier.h>
-_CLC_OVERLOAD _CLC_DEF void __clc_barrier(Scope scope,
- MemorySemantics semantics) {
- __clc_mem_fence(scope, semantics);
+_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope,
+ int memory_order) {
+ __clc_mem_fence(memory_scope, memory_order);
__builtin_amdgcn_s_barrier();
}
diff --git a/libclc/clc/lib/ptx-nvidiacl/SOURCES b/libclc/clc/lib/ptx-nvidiacl/SOURCES
index a0fb861549ebc..b6f50654f89c5 100644
--- a/libclc/clc/lib/ptx-nvidiacl/SOURCES
+++ b/libclc/clc/lib/ptx-nvidiacl/SOURCES
@@ -1,5 +1,5 @@
mem_fence/clc_mem_fence.cl
-synchronization/clc_barrier.cl
+synchronization/clc_work_group_barrier.cl
workitem/clc_get_global_id.cl
workitem/clc_get_group_id.cl
workitem/clc_get_local_id.cl
diff --git a/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl
index 4c0d342b7244f..b3e2375e755a2 100644
--- a/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl
+++ b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl
@@ -8,8 +8,8 @@
#include <clc/mem_fence/clc_mem_fence.h>
-_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(Scope scope,
- MemorySemantics semantics) {
- if (semantics & (CrossWorkgroupMemory | WorkgroupMemory))
+_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope,
+ int memory_order) {
+ if (memory_scope & (__MEMORY_SCOPE_DEVICE | __MEMORY_SCOPE_WRKGRP))
__nvvm_membar_cta();
}
diff --git a/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_barrier.cl b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl
similarity index 66%
rename from libclc/clc/lib/ptx-nvidiacl/synchronization/clc_barrier.cl
rename to libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl
index 920b17cb02f92..6cb37a38f06ac 100644
--- a/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_barrier.cl
+++ b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl
@@ -6,9 +6,9 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/synchronization/clc_barrier.h>
+#include <clc/synchronization/clc_work_group_barrier.h>
-_CLC_OVERLOAD _CLC_DEF void __clc_barrier(Scope scope,
- MemorySemantics semantics) {
+_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope,
+ int memory_order) {
__syncthreads();
}
diff --git a/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h b/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h
index 18f9a4afb2d5f..7b2f701c1ff99 100644
--- a/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h
+++ b/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h
@@ -11,32 +11,6 @@
typedef uint cl_mem_fence_flags;
-// Copied from
-// https://github.com/llvm/llvm-project/blob/08e40c12fa0c/clang/lib/Headers/opencl-c-base.h#L390
-typedef enum memory_scope {
- memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
- memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
- memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
-#if defined(__opencl_c_atomic_scope_all_devices)
- memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
-#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100)
- memory_scope_all_devices = memory_scope_all_svm_devices,
-#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >=
- // 202100)
-#endif // defined(__opencl_c_atomic_scope_all_devices)
-/**
- * Subgroups have different requirements on forward progress, so just test
- * all the relevant macros.
- * CL 3.0 sub-groups "they are not guaranteed to make independent forward
- * progress" KHR subgroups "Subgroups within a workgroup are independent, make
- * forward progress with respect to each other"
- */
-#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || \
- defined(__opencl_c_subgroups)
- memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
-#endif
-} memory_scope;
-
#define CLK_LOCAL_MEM_FENCE 1
#define CLK_GLOBAL_MEM_FENCE 2
#define CLK_IMAGE_MEM_FENCE 4
diff --git a/libclc/opencl/include/clc/opencl/synchronization/utils.h b/libclc/opencl/include/clc/opencl/synchronization/utils.h
index bbcfa20a556d5..cf3baf28cb5f1 100644
--- a/libclc/opencl/include/clc/opencl/synchronization/utils.h
+++ b/libclc/opencl/include/clc/opencl/synchronization/utils.h
@@ -10,41 +10,15 @@
#define __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__
#include <clc/internal/clc.h>
-#include <clc/mem_fence/clc_mem_scope_semantics.h>
#include <clc/opencl/synchronization/cl_mem_fence_flags.h>
-_CLC_INLINE Scope getCLCScope(memory_scope memory_scope) {
- switch (memory_scope) {
- case memory_scope_work_item:
- return Invocation;
-#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || \
- defined(__opencl_c_subgroups)
- case memory_scope_sub_group:
- return Subgroup;
-#endif
- case memory_scope_work_group:
- return Workgroup;
- case memory_scope_device:
- return Device;
- default:
- break;
- }
-#ifdef __opencl_c_atomic_scope_all_devices
- return CrossDevice;
-#else
- return Device;
-#endif
-}
-
-_CLC_INLINE MemorySemantics getCLCMemorySemantics(cl_mem_fence_flags flag) {
- MemorySemantics semantics = SequentiallyConsistent;
+_CLC_INLINE int getCLCMemoryScope(cl_mem_fence_flags flag) {
+ int memory_scope = 0;
if (flag & CLK_GLOBAL_MEM_FENCE)
- semantics |= CrossWorkgroupMemory;
+ memory_scope |= __MEMORY_SCOPE_DEVICE;
if (flag & CLK_LOCAL_MEM_FENCE)
- semantics |= WorkgroupMemory;
- if (flag & CLK_IMAGE_MEM_FENCE)
- semantics |= ImageMemory;
- return semantics;
+ memory_scope |= __MEMORY_SCOPE_WRKGRP;
+ return memory_scope;
}
#endif // __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__
diff --git a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl
index 10d879d835c06..81216d6a26cf2 100644
--- a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl
+++ b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl
@@ -11,9 +11,9 @@
#include <clc/opencl/synchronization/utils.h>
_CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) {
- Scope scope = getCLCScope(memory_scope_device);
- MemorySemantics semantics = getCLCMemorySemantics(flags);
- __clc_mem_fence(scope, semantics);
+ int memory_scope = getCLCMemoryScope(flags);
+ int memory_order = __ATOMIC_SEQ_CST;
+ __clc_mem_fence(memory_scope, memory_order);
}
// We don't have separate mechanism for read and write fences
diff --git a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl
index b8372d4800bf1..c8322e602302c 100644
--- a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl
+++ b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl
@@ -8,10 +8,10 @@
#include <clc/opencl/synchronization/barrier.h>
#include <clc/opencl/synchronization/utils.h>
-#include <clc/synchronization/clc_barrier.h>
+#include <clc/synchronization/clc_work_group_barrier.h>
_CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) {
- Scope scope = getCLCScope(memory_scope_device);
- MemorySemantics semantics = getCLCMemorySemantics(flags);
- __clc_barrier(scope, semantics);
+ int memory_scope = getCLCMemoryScope(flags);
+ int memory_order = __ATOMIC_SEQ_CST;
+ __clc_work_group_barrier(memory_scope, memory_order);
}
diff --git a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl
index 2d591c90d63c2..e22ed870a7e6b 100644
--- a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl
+++ b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl
@@ -11,9 +11,9 @@
#include <clc/opencl/synchronization/utils.h>
_CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) {
- Scope scope = getCLCScope(memory_scope_device);
- MemorySemantics semantics = getCLCMemorySemantics(flags);
- __clc_mem_fence(scope, semantics);
+ int memory_scope = getCLCMemoryScope(flags);
+ int memory_order = __ATOMIC_SEQ_CST;
+ __clc_mem_fence(memory_scope, memory_order);
}
// We do not have separate mechanism for read and write fences.
diff --git a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl
index b8372d4800bf1..c8322e602302c 100644
--- a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl
+++ b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl
@@ -8,10 +8,10 @@
#include <clc/opencl/synchronization/barrier.h>
#include <clc/opencl/synchronization/utils.h>
-#include <clc/synchronization/clc_barrier.h>
+#include <clc/synchronization/clc_work_group_barrier.h>
_CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) {
- Scope scope = getCLCScope(memory_scope_device);
- MemorySemantics semantics = getCLCMemorySemantics(flags);
- __clc_barrier(scope, semantics);
+ int memory_scope = getCLCMemoryScope(flags);
+ int memory_order = __ATOMIC_SEQ_CST;
+ __clc_work_group_barrier(memory_scope, memory_order);
}
More information about the cfe-commits
mailing list