[Openmp-commits] [openmp] [OFFLOAD] Add spirv implementation for named barrier (PR #180393)

via Openmp-commits openmp-commits at lists.llvm.org
Wed Mar 4 18:07:04 PST 2026


https://github.com/fineg74 updated https://github.com/llvm/llvm-project/pull/180393

>From 2ddf1f718c7319c5b18efb9102ca4267b481e10e Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Sat, 7 Feb 2026 23:16:41 -0800
Subject: [PATCH 1/7] Add spirv implementation for named barrier

---
 openmp/device/src/Synchronization.cpp | 43 +++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 3554226d2ee75..7a065b0a27fcf 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -181,8 +181,47 @@ void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
 ///}
 
 #if defined(__SPIRV__)
-void namedBarrierInit() { __builtin_trap(); } // TODO
-void namedBarrier() { __builtin_trap(); }     // TODO
+
+[[clang::loader_uninitialized]] Local<uint32_t> namedBarrierTracker;
+
+void namedBarrierInit() {
+  atomic::store(&namedBarrierTracker, 0u, atomic::seq_cst);
+}
+
+void namedBarrier() {
+  uint32_t NumThreads = omp_get_num_threads();
+  
+  // Uses two 16 bit unsigned counters. One for the number of threads to have
+  // reached the barrier, and one to count how many times the barrier has been
+  // passed. These are packed in a single atomically accessed 32 bit integer.
+  // Low bits for the number of threads, assumed zero before this call.
+  // High bits to count the number of times the barrier has been passed.
+
+  // Increment the low 16 bits once.
+
+  uint32_t load = atomic::add(&namedBarrierTracker, 1,
+                              atomic::seq_cst); 
+
+  // Record the number of times the barrier has been passed
+  uint32_t generation = load & 0xffff0000u;
+
+  if ((load & 0x0000ffffu) == (NumThreads - 1)) {
+    // Reached NumWaves in low bits so this is the last wave.
+    // Set low bits to zero and increment high bits
+    load += 0x00010000u; // wrap is safe
+    load &= 0xffff0000u; // because bits zeroed second
+
+    // Reset the wave counter and release the waiting waves
+    atomic::store(&namedBarrierTracker, load, atomic::seq_cst);
+  } else {
+    // more waves still to go, spin until generation counter changes
+    do {
+      load = atomic::load(&namedBarrierTracker, atomic::seq_cst);
+    } while ((load & 0xffff0000u) == generation);
+  }
+  __gpu_sync_threads();
+
+}
 
 void unsetLock(omp_lock_t *Lock) {
   atomic::store((int32_t *)Lock, 0, atomic::seq_cst);

>From 7ada5b0b4a848192b914873639feb54211f314f6 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Sat, 7 Feb 2026 23:23:01 -0800
Subject: [PATCH 2/7] Fix formatting

---
 openmp/device/src/Synchronization.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 7a065b0a27fcf..cebc9ea2e5796 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -199,8 +199,7 @@ void namedBarrier() {
 
   // Increment the low 16 bits once.
 
-  uint32_t load = atomic::add(&namedBarrierTracker, 1,
-                              atomic::seq_cst); 
+  uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst); 
 
   // Record the number of times the barrier has been passed
   uint32_t generation = load & 0xffff0000u;
@@ -220,7 +219,6 @@ void namedBarrier() {
     } while ((load & 0xffff0000u) == generation);
   }
   __gpu_sync_threads();
-
 }
 
 void unsetLock(omp_lock_t *Lock) {

>From b138f5115021ee19710531b5209455b2e2014b11 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Sat, 7 Feb 2026 23:28:46 -0800
Subject: [PATCH 3/7] Fix formatting

---
 openmp/device/src/Synchronization.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index cebc9ea2e5796..1053ccf0ea092 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -190,7 +190,7 @@ void namedBarrierInit() {
 
 void namedBarrier() {
   uint32_t NumThreads = omp_get_num_threads();
-  
+
   // Uses two 16 bit unsigned counters. One for the number of threads to have
   // reached the barrier, and one to count how many times the barrier has been
   // passed. These are packed in a single atomically accessed 32 bit integer.
@@ -199,7 +199,7 @@ void namedBarrier() {
 
   // Increment the low 16 bits once.
 
-  uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst); 
+  uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);
 
   // Record the number of times the barrier has been passed
   uint32_t generation = load & 0xffff0000u;

>From 23cc317881c3feaf42a7ce6012be19529d79f59c Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Wed, 25 Feb 2026 12:30:22 -0800
Subject: [PATCH 4/7] Simplify implementation

---
 openmp/device/src/Synchronization.cpp | 36 ++++++++-------------------
 1 file changed, 10 insertions(+), 26 deletions(-)

diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 1053ccf0ea092..8734467f7b360 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -190,33 +190,17 @@ void namedBarrierInit() {
 
 void namedBarrier() {
   uint32_t NumThreads = omp_get_num_threads();
+  uint32_t ThreadId = mapping::getThreadIdInBlock();
+  if (ThreadId < NumThreads) {
+    uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);
 
-  // Uses two 16 bit unsigned counters. One for the number of threads to have
-  // reached the barrier, and one to count how many times the barrier has been
-  // passed. These are packed in a single atomically accessed 32 bit integer.
-  // Low bits for the number of threads, assumed zero before this call.
-  // High bits to count the number of times the barrier has been passed.
-
-  // Increment the low 16 bits once.
-
-  uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);
-
-  // Record the number of times the barrier has been passed
-  uint32_t generation = load & 0xffff0000u;
-
-  if ((load & 0x0000ffffu) == (NumThreads - 1)) {
-    // Reached NumWaves in low bits so this is the last wave.
-    // Set low bits to zero and increment high bits
-    load += 0x00010000u; // wrap is safe
-    load &= 0xffff0000u; // because bits zeroed second
-
-    // Reset the wave counter and release the waiting waves
-    atomic::store(&namedBarrierTracker, load, atomic::seq_cst);
-  } else {
-    // more waves still to go, spin until generation counter changes
-    do {
-      load = atomic::load(&namedBarrierTracker, atomic::seq_cst);
-    } while ((load & 0xffff0000u) == generation);
+    if (load == NumThreads - 1) {
+      atomic::store(&namedBarrierTracker, 0, atomic::seq_cst);
+    } else {
+      do {
+        load = atomic::load(&namedBarrierTracker, atomic::seq_cst);
+      } while (load != 0);
+    }
   }
   __gpu_sync_threads();
 }

>From fd2a8b4233162d8d93c07e3892daf5aeb2c84f0a Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Wed, 4 Mar 2026 14:16:32 -0800
Subject: [PATCH 5/7] Tweak number of threads calculation

---
 openmp/device/src/Synchronization.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 8734467f7b360..dee36b7e56027 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -189,7 +189,7 @@ void namedBarrierInit() {
 }
 
 void namedBarrier() {
-  uint32_t NumThreads = omp_get_num_threads();
+  uint32_t NumThreads = mapping::getMaxTeamThreads();
   uint32_t ThreadId = mapping::getThreadIdInBlock();
   if (ThreadId < NumThreads) {
     uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);

>From b5ba18bcfc412d9e2271fa0663f657f38d8e442b Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Wed, 4 Mar 2026 15:54:53 -0800
Subject: [PATCH 6/7] Revert "Tweak number of threads calculation"

This reverts commit fd2a8b4233162d8d93c07e3892daf5aeb2c84f0a.
---
 openmp/device/src/Synchronization.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index dee36b7e56027..8734467f7b360 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -189,7 +189,7 @@ void namedBarrierInit() {
 }
 
 void namedBarrier() {
-  uint32_t NumThreads = mapping::getMaxTeamThreads();
+  uint32_t NumThreads = omp_get_num_threads();
   uint32_t ThreadId = mapping::getThreadIdInBlock();
   if (ThreadId < NumThreads) {
     uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);

>From 2553bb43399865b1e9b3e8779f0f863701897458 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Wed, 4 Mar 2026 18:06:33 -0800
Subject: [PATCH 7/7] Revert "Revert "Tweak number of threads calculation""

This reverts commit b5ba18bcfc412d9e2271fa0663f657f38d8e442b.
---
 openmp/device/src/Synchronization.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 8734467f7b360..dee36b7e56027 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -189,7 +189,7 @@ void namedBarrierInit() {
 }
 
 void namedBarrier() {
-  uint32_t NumThreads = omp_get_num_threads();
+  uint32_t NumThreads = mapping::getMaxTeamThreads();
   uint32_t ThreadId = mapping::getThreadIdInBlock();
   if (ThreadId < NumThreads) {
     uint32_t load = atomic::add(&namedBarrierTracker, 1, atomic::seq_cst);



More information about the Openmp-commits mailing list