[Openmp-commits] [openmp] [OpenMP] Fix endian bug in hierarchical barrier code (PR #117073)
Ulrich Weigand via Openmp-commits
openmp-commits at lists.llvm.org
Wed Nov 20 14:52:19 PST 2024
https://github.com/uweigand updated https://github.com/llvm/llvm-project/pull/117073
>From 77acb609c1b306afe285ce43208de9d2744d0529 Mon Sep 17 00:00:00 2001
From: Ulrich Weigand <ulrich.weigand at de.ibm.com>
Date: Wed, 20 Nov 2024 23:22:39 +0100
Subject: [PATCH] [OpenMP] Fix endian bug in hierarchical barrier code
The "oncore" flavor of the hierarchical barrier code uses a
single uint64 to hold both a barrier state *and* a set
of flags for the current node's "leaf kids" in the tree.
To make this work, the barrier state is placed in the
least-significant byte of the uint64, and the flags for
leaf kids are placed in other bytes of the uint64
starting from the most-significant byte to represent
the first child.
At least, this is how it works on little-endian systems.
On big-endian hosts, the current code unfortunately
places the leaf kid flags starting from the *least*
significant byte of the uint64, where they will overlap
and clobber the barrier state byte, resulting in
corrupted barrier behavior.
To fix this, this PR changes the child flag offsets to
always start with the MSB, on both big- and little-
endian hosts. This was complicated a bit by the following
issues:
- The byte offset was stored "off-by-one" for some reason.
This makes it impossible to represent the MSB on big-
endian systems (i.e. byte 0, which would have to be
represented by -1 if offset by one, which does not
fit into the uint8 offset field).
Fixed by storing and using the "offset" field as the
actual byte number, not off-by-one.
- The "offset" field was never used for the master node,
but it was still computed. That computed value was also
invalid as it pointed outside the target uint64, but that
didn't matter as it was unused. For big-endian systems,
however, the computation would have resulted in a
*negative* value, which again cannot be stored in the uint8
field.
Fixed by skipping the (unused) computation on the master
node and setting the "offset" field to a dummy value.
This is intended to be NFC on little-endian machines.
Fixes: https://github.com/llvm/llvm-project/issues/116215
---
openmp/runtime/src/kmp_barrier.cpp | 31 ++++++++++++++++++++++--------
1 file changed, 23 insertions(+), 8 deletions(-)
diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp
index d7ef57c608149e..5d8307ca4e2740 100644
--- a/openmp/runtime/src/kmp_barrier.cpp
+++ b/openmp/runtime/src/kmp_barrier.cpp
@@ -1292,6 +1292,15 @@ static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
kmp_bstate_t *thr_bar,
kmp_uint32 nproc, int gtid,
int tid, kmp_team_t *team) {
+ // Helper macro to identify bytes in a kmp_uint64 in an endian-independent
+ // way. Input 0 results in the byte address of the MSB, input 7 results
+ // in the byte address of the LSB.
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define __kmp_msb_byteoffset(offset) (offset)
+#else
+#define __kmp_msb_byteoffset(offset) (7 - (offset))
+#endif
+
// Checks to determine if (re-)initialization is needed
bool uninitialized = thr_bar->team == NULL;
bool team_changed = team != thr_bar->team;
@@ -1306,6 +1315,7 @@ static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
if (uninitialized || team_sz_changed || tid_changed) {
thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
thr_bar->parent_tid = -1; // default for primary thread
+ thr_bar->offset = -1; // unused for primary thread
if (!KMP_MASTER_TID(tid)) {
// if not primary thread, find parent thread in hierarchy
kmp_uint32 d = 0;
@@ -1325,10 +1335,14 @@ static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
}
++d;
}
+
+ kmp_uint32 offset = ((kmp_uint32)tid - thr_bar->parent_tid) /
+ thr_bar->skip_per_level[thr_bar->my_level];
+ offset = offset - 1;
+ KMP_ASSERT(offset < 7);
+ __kmp_type_convert(__kmp_msb_byteoffset(offset), &(thr_bar->offset));
}
- __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
- (thr_bar->skip_per_level[thr_bar->my_level])),
- &(thr_bar->offset));
+
thr_bar->old_tid = tid;
thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
thr_bar->team = team;
@@ -1350,9 +1364,11 @@ static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
__kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
thr_bar->leaf_state = 0;
for (int i = 0; i < thr_bar->leaf_kids; ++i)
- ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
+ ((char *)&(thr_bar->leaf_state))[__kmp_msb_byteoffset(i)] = 1;
}
return retval;
+
+#undef __kmp_msb_byteoffset
}
static void __kmp_hierarchical_barrier_gather(
@@ -1506,8 +1522,7 @@ static void __kmp_hierarchical_barrier_gather(
} else {
// Leaf does special release on "offset" bits of parent's b_arrived flag
thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
- kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
- thr_bar->offset + 1);
+ kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
flag.set_waiter(other_threads[thr_bar->parent_tid]);
flag.release();
}
@@ -1555,7 +1570,7 @@ static void __kmp_hierarchical_barrier_release(
// Wait on my "offset" bits on parent's b_go flag
thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
- thr_bar->offset + 1, bt,
+ thr_bar->offset, bt,
this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
flag.wait(this_thr, TRUE);
if (thr_bar->wait_flag ==
@@ -1564,7 +1579,7 @@ static void __kmp_hierarchical_barrier_release(
KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
} else { // Reset my bits on parent's b_go flag
(RCAST(volatile char *,
- &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
+ &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
}
}
thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
More information about the Openmp-commits
mailing list