[Openmp-commits] [llvm] [openmp] [OFFLOAD] Add spirv implementation for named barrier (PR #180393)
via Openmp-commits
openmp-commits at lists.llvm.org
Tue Mar 10 09:14:56 PDT 2026
https://github.com/fineg74 updated https://github.com/llvm/llvm-project/pull/180393
From 2ddf1f718c7319c5b18efb9102ca4267b481e10e Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Sat, 7 Feb 2026 23:16:41 -0800
Subject: [PATCH 1/9] Add spirv implementation for named barrier
---
openmp/device/src/Synchronization.cpp | 43 +++++++++++++++++++++++++--
1 file changed, 41 insertions(+), 2 deletions(-)
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 3554226d2ee75..7a065b0a27fcf 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -181,8 +181,47 @@ void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
///}
#if defined(__SPIRV__)
-void namedBarrierInit() { __builtin_trap(); } // TODO
-void namedBarrier() { __builtin_trap(); } // TODO
+
+[[clang::loader_uninitialized]] Local<uint32_t> namedBarrierTracker;
+
+void namedBarrierInit() {
+ atomic::store(&namedBarrierTracker, 0u, atomic::seq_cst);
+}
+
+void namedBarrier() {
+ uint32_t NumThreads = omp_get_num_threads();
+
+ // Uses two 16 bit unsigned counters. One for the number of threads to have
+ // reached the barrier, and one to count how many times the barrier has been
+ // passed. These are packed in a single atomically accessed 32 bit integer.
+ // Low bits for the number of threads, assumed zero before this call.
+ // High bits to count the number of times the barrier has been passed.
+
+ // Increment the low 16 bits once.
+
+ uint32_t load = atomic::add(&namedBarrierTracker, 1,
+ atomic::seq_cst);
+
+ // Record the number of times the barrier has been passed
+ uint32_t generation = load & 0xffff0000u;
+
+ if ((load & 0x0000ffffu) == (NumThreads - 1)) {
+ // Reached NumWaves in low bits so this is the last wave.
+ // Set low bits to zero and increment high bits
+ load += 0x00010000u; // wrap is safe
+ load &= 0xffff0000u; // because bits zeroed second
+
+ // Reset the wave counter and release the waiting waves
+ atomic::store(&namedBarrierTracker, load, atomic::seq_cst);
+ } else {
+ // more waves still to go, spin until generation counter changes
+ do {
+ load = atomic::load(&namedBarrierTracker, atomic::seq_cst);
+ } while ((load & 0xffff0000u) == generation);
+ }
+ __gpu_sync_threads();
+
+}
void unsetLock(omp_lock_t *Lock) {
atomic::store((int32_t *)Lock, 0, atomic::seq_cst);
From 7ada5b0b4a848192b914873639feb54211f314f6 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Sat, 7 Feb 2026 23:23:01 -0800
Subject: [PATCH 2/9] Fix formatting
---
openmp/device/src/Synchronization.cpp | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 7a065b0a27fcf..cebc9ea2e5796 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -199,8 +199,7 @@ void namedBarrier() {
// Increment the low 16 bits once.
- uint32_t load = atomic::add(&namedBarrierTracker, 1,
- atomic::seq_cst);
+ uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);
// Record the number of times the barrier has been passed
uint32_t generation = load & 0xffff0000u;
@@ -220,7 +219,6 @@ void namedBarrier() {
} while ((load & 0xffff0000u) == generation);
}
__gpu_sync_threads();
-
}
void unsetLock(omp_lock_t *Lock) {
From b138f5115021ee19710531b5209455b2e2014b11 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Sat, 7 Feb 2026 23:28:46 -0800
Subject: [PATCH 3/9] Fix formatting
---
openmp/device/src/Synchronization.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index cebc9ea2e5796..1053ccf0ea092 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -190,7 +190,7 @@ void namedBarrierInit() {
void namedBarrier() {
uint32_t NumThreads = omp_get_num_threads();
-
+
// Uses two 16 bit unsigned counters. One for the number of threads to have
// reached the barrier, and one to count how many times the barrier has been
// passed. These are packed in a single atomically accessed 32 bit integer.
@@ -199,7 +199,7 @@ void namedBarrier() {
// Increment the low 16 bits once.
- uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);
+ uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);
// Record the number of times the barrier has been passed
uint32_t generation = load & 0xffff0000u;
From 23cc317881c3feaf42a7ce6012be19529d79f59c Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Wed, 25 Feb 2026 12:30:22 -0800
Subject: [PATCH 4/9] Simplify implementation
---
openmp/device/src/Synchronization.cpp | 36 ++++++++-------------------
1 file changed, 10 insertions(+), 26 deletions(-)
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 1053ccf0ea092..8734467f7b360 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -190,33 +190,17 @@ void namedBarrierInit() {
void namedBarrier() {
uint32_t NumThreads = omp_get_num_threads();
+ uint32_t ThreadId = mapping::getThreadIdInBlock();
+ if (ThreadId < NumThreads) {
+ uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);
- // Uses two 16 bit unsigned counters. One for the number of threads to have
- // reached the barrier, and one to count how many times the barrier has been
- // passed. These are packed in a single atomically accessed 32 bit integer.
- // Low bits for the number of threads, assumed zero before this call.
- // High bits to count the number of times the barrier has been passed.
-
- // Increment the low 16 bits once.
-
- uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);
-
- // Record the number of times the barrier has been passed
- uint32_t generation = load & 0xffff0000u;
-
- if ((load & 0x0000ffffu) == (NumThreads - 1)) {
- // Reached NumWaves in low bits so this is the last wave.
- // Set low bits to zero and increment high bits
- load += 0x00010000u; // wrap is safe
- load &= 0xffff0000u; // because bits zeroed second
-
- // Reset the wave counter and release the waiting waves
- atomic::store(&namedBarrierTracker, load, atomic::seq_cst);
- } else {
- // more waves still to go, spin until generation counter changes
- do {
- load = atomic::load(&namedBarrierTracker, atomic::seq_cst);
- } while ((load & 0xffff0000u) == generation);
+ if (load == NumThreads - 1) {
+ atomic::store(&namedBarrierTracker, 0, atomic::seq_cst);
+ } else {
+ do {
+ load = atomic::load(&namedBarrierTracker, atomic::seq_cst);
+ } while (load != 0);
+ }
}
__gpu_sync_threads();
}
From fd2a8b4233162d8d93c07e3892daf5aeb2c84f0a Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Wed, 4 Mar 2026 14:16:32 -0800
Subject: [PATCH 5/9] Tweak number of threads calculation
---
openmp/device/src/Synchronization.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 8734467f7b360..dee36b7e56027 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -189,7 +189,7 @@ void namedBarrierInit() {
}
void namedBarrier() {
- uint32_t NumThreads = omp_get_num_threads();
+ uint32_t NumThreads = mapping::getMaxTeamThreads();
uint32_t ThreadId = mapping::getThreadIdInBlock();
if (ThreadId < NumThreads) {
uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);
From b5ba18bcfc412d9e2271fa0663f657f38d8e442b Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Wed, 4 Mar 2026 15:54:53 -0800
Subject: [PATCH 6/9] Revert "Tweak number of threads calculation"
This reverts commit fd2a8b4233162d8d93c07e3892daf5aeb2c84f0a.
---
openmp/device/src/Synchronization.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index dee36b7e56027..8734467f7b360 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -189,7 +189,7 @@ void namedBarrierInit() {
}
void namedBarrier() {
- uint32_t NumThreads = mapping::getMaxTeamThreads();
+ uint32_t NumThreads = omp_get_num_threads();
uint32_t ThreadId = mapping::getThreadIdInBlock();
if (ThreadId < NumThreads) {
uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);
From 2553bb43399865b1e9b3e8779f0f863701897458 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Wed, 4 Mar 2026 18:06:33 -0800
Subject: [PATCH 7/9] Revert "Revert "Tweak number of threads calculation""
This reverts commit b5ba18bcfc412d9e2271fa0663f657f38d8e442b.
---
openmp/device/src/Synchronization.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 8734467f7b360..dee36b7e56027 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -189,7 +189,7 @@ void namedBarrierInit() {
}
void namedBarrier() {
- uint32_t NumThreads = omp_get_num_threads();
+ uint32_t NumThreads = mapping::getMaxTeamThreads();
uint32_t ThreadId = mapping::getThreadIdInBlock();
if (ThreadId < NumThreads) {
uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);
From 98971b72ba669c0c3f8ae628f76e8da8f3750125 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Fri, 6 Mar 2026 16:42:30 -0800
Subject: [PATCH 8/9] Simplify the implementation
---
openmp/device/src/Synchronization.cpp | 21 +++++++++------------
1 file changed, 9 insertions(+), 12 deletions(-)
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index dee36b7e56027..0fa6abd751f33 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -189,18 +189,15 @@ void namedBarrierInit() {
}
void namedBarrier() {
- uint32_t NumThreads = mapping::getMaxTeamThreads();
- uint32_t ThreadId = mapping::getThreadIdInBlock();
- if (ThreadId < NumThreads) {
- uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);
-
- if (load == NumThreads - 1) {
- atomic::store(&namedBarrierTracker, 0, atomic::seq_cst);
- } else {
- do {
- load = atomic::load(&namedBarrierTracker, atomic::seq_cst);
- } while (load != 0);
- }
+ uint32_t NumThreads = omp_get_num_threads();
+ uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);
+
+ if (load == NumThreads - 1) {
+ atomic::store(&namedBarrierTracker, 0, atomic::seq_cst);
+ } else {
+ do {
+ load = atomic::load(&namedBarrierTracker, atomic::seq_cst);
+ } while (load != 0);
}
__gpu_sync_threads();
}
From cc9cee4dbce84bf936895ecfbac185f77546cedd Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Tue, 10 Mar 2026 09:14:41 -0700
Subject: [PATCH 9/9] Disable hanging tests
---
offload/test/mapping/firstprivate_aligned.cpp | 1 +
offload/test/offloading/atomic-compare-signedness.c | 2 +-
offload/test/offloading/bug49021.cpp | 2 +-
offload/test/sanitizer/kernel_trap_async.c | 2 +-
offload/test/sanitizer/use_after_free_1.c | 2 +-
offload/test/sanitizer/use_after_free_2.c | 2 +-
6 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/offload/test/mapping/firstprivate_aligned.cpp b/offload/test/mapping/firstprivate_aligned.cpp
index ae6be0f0c07f4..4521f4397112b 100644
--- a/offload/test/mapping/firstprivate_aligned.cpp
+++ b/offload/test/mapping/firstprivate_aligned.cpp
@@ -1,4 +1,5 @@
// RUN: %libomptarget-compilexx-generic -O3 && %libomptarget-run-generic
+// UNSUPPORTED: intelgpu
#include <stdio.h>
diff --git a/offload/test/offloading/atomic-compare-signedness.c b/offload/test/offloading/atomic-compare-signedness.c
index e1c8a2f846a8f..2fd6a484460d0 100644
--- a/offload/test/offloading/atomic-compare-signedness.c
+++ b/offload/test/offloading/atomic-compare-signedness.c
@@ -7,7 +7,7 @@
// RUN: %libomptarget-run-generic | %fcheck-generic
// RUN: %libomptarget-compileopt-generic -fopenmp-version=51
// RUN: %libomptarget-run-generic | %fcheck-generic
-// XFAIL: intelgpu
+// UNSUPPORTED: intelgpu
// High parallelism increases our chances of detecting a lack of atomicity.
#define NUM_THREADS_TRY 256
diff --git a/offload/test/offloading/bug49021.cpp b/offload/test/offloading/bug49021.cpp
index 1bd35c66f5db2..6d919871f0d26 100644
--- a/offload/test/offloading/bug49021.cpp
+++ b/offload/test/offloading/bug49021.cpp
@@ -3,7 +3,7 @@
// RUN: %libomptarget-compilexx-generic -O3 -ffast-math && %libomptarget-run-generic
// RUN: %libomptarget-compileoptxx-generic -O3 && %libomptarget-run-generic
// RUN: %libomptarget-compileoptxx-generic -O3 -ffast-math && %libomptarget-run-generic
-// XFAIL: intelgpu
+// UNSUPPORTED: intelgpu
// clang-format on
#include <iostream>
diff --git a/offload/test/sanitizer/kernel_trap_async.c b/offload/test/sanitizer/kernel_trap_async.c
index ae1bb8d322e4c..aa31c6371fa0c 100644
--- a/offload/test/sanitizer/kernel_trap_async.c
+++ b/offload/test/sanitizer/kernel_trap_async.c
@@ -12,7 +12,7 @@
// UNSUPPORTED: aarch64-unknown-linux-gnu
// UNSUPPORTED: x86_64-unknown-linux-gnu
// UNSUPPORTED: s390x-ibm-linux-gnu
-// XFAIL: intelgpu
+// UNSUPPORTED: intelgpu
#include <omp.h>
diff --git a/offload/test/sanitizer/use_after_free_1.c b/offload/test/sanitizer/use_after_free_1.c
index 927e54ddab2bd..2dee3d107a115 100644
--- a/offload/test/sanitizer/use_after_free_1.c
+++ b/offload/test/sanitizer/use_after_free_1.c
@@ -9,7 +9,7 @@
// UNSUPPORTED: nvidiagpu
//
// REQUIRES: gpu
-// XFAIL: intelgpu
+// UNSUPPORTED: intelgpu
#include <omp.h>
diff --git a/offload/test/sanitizer/use_after_free_2.c b/offload/test/sanitizer/use_after_free_2.c
index ece0cfe60875d..3ba621524e566 100644
--- a/offload/test/sanitizer/use_after_free_2.c
+++ b/offload/test/sanitizer/use_after_free_2.c
@@ -7,7 +7,7 @@
// UNSUPPORTED: nvidiagpu
//
// REQUIRES: gpu
-// XFAIL: intelgpu
+// UNSUPPORTED: intelgpu
// If offload memory pooling is enabled for a large allocation, reuse error is
// not detected. UNSUPPORTED: large_allocation_memory_pool
More information about the Openmp-commits
mailing list