[compiler-rt] [openmp] [clang] [TSan][OpenMP][Archer] Treat all reduction operations as atomic (PR #74631)

Wed Dec 6 09:42:45 PST 2023

llvmbot wrote:



@llvm/pr-subscribers-compiler-rt-sanitizer

@llvm/pr-subscribers-clang

Author: Joachim (jprotze)

<details>
<summary>Changes</summary>

This patch rebases https://reviews.llvm.org/D108046 to the new ThreadSanitizer runtime.

The idea of the new ThreadSanitizer Annotation function is to promote all memory accesses to be treated and logged as they would be explicit atomic accesses. I used the performance benchmark from the initial fiber review (https://reviews.llvm.org/D54889#1343582). The TSan-specific changes of this PR increase the execution time from 8.37 to 8.52 seconds on my system, which is a 1.8% runtime increase.

The current tests for this new Annotation are integrated into the tests for the OpenMP-specific tool Archer.

The new Annotations are used in Archer to promote all memory accesses performed to implement an OpenMP reduction as being atomic accesses. With these changes, ThreadSanitizer+Archer successfully detect the race in `openmp/tools/archer/tests/races/parallel-for-array-reduction-no-barrier.c`. The challenge in this test is to detect the race between the memory access from the primary thread before the reduction (line 30), which is not synchronized with the OpenMP reduction (line 31).

The OpenMP CodeGen generates three different code patterns for the reduction from which the OpenMP runtime chooses one implementation at runtime. The new analysis is only compatible with two of these code patterns, therefore we skip generation of the third code pattern, if the TSan flag is present during compilation.


---

Patch is 21.08 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/74631.diff


14 Files Affected:

- (modified) clang/lib/CodeGen/CGOpenMPRuntime.cpp (+11-3) 
- (modified) compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp (+10) 
- (modified) compiler-rt/lib/tsan/rtl/tsan_rtl.cpp (+15) 
- (modified) compiler-rt/lib/tsan/rtl/tsan_rtl.h (+3) 
- (modified) compiler-rt/lib/tsan/rtl/tsan_shadow.h (+7-2) 
- (modified) openmp/tools/archer/ompt-tsan.cpp (+23-8) 
- (added) openmp/tools/archer/tests/races/parallel-for-array-reduction-no-barrier.c (+42) 
- (added) openmp/tools/archer/tests/races/parallel-for-array-reduction-nowait.c (+42) 
- (added) openmp/tools/archer/tests/races/parallel-for-reduction-no-barrier.c (+42) 
- (added) openmp/tools/archer/tests/races/parallel-for-reduction-nowait.c (+42) 
- (added) openmp/tools/archer/tests/reduction/parallel-for-array-reduction-barrier.c (+45) 
- (added) openmp/tools/archer/tests/reduction/parallel-for-reduction-barrier.c (+45) 
- (modified) openmp/tools/archer/tests/reduction/parallel-reduction-nowait.c (+1) 
- (modified) openmp/tools/archer/tests/reduction/parallel-reduction.c (+8-6) 


``````````diff

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 55648963df36a..a0381c315e5ec 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4810,13 +4810,16 @@ llvm::Function *CGOpenMPRuntime::emitReductionFunction(
   Args.push_back(&RHSArg);
   const auto &CGFI =
       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  CodeGenFunction CGF(CGM);
   std::string Name = getReductionFuncName(ReducerName);
   auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
                                     llvm::GlobalValue::InternalLinkage, Name,
                                     &CGM.getModule());
+  if (CGF.SanOpts.has(SanitizerKind::Thread)) {
+    return Fn;
+  }
   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
   Fn->setDoesNotRecurse();
-  CodeGenFunction CGF(CGM);
   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
 
   // Dst = (void*[n])(LHSArg);
@@ -5008,6 +5011,11 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
   llvm::Function *ReductionFn = emitReductionFunction(
       CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
       Privates, LHSExprs, RHSExprs, ReductionOps);
+  llvm::Value *ReductionFnP = ReductionFn;
+  if (CGF.SanOpts.has(SanitizerKind::Thread)) {
+    ReductionFnP = llvm::ConstantPointerNull::get(
+        llvm::PointerType::get(ReductionFn->getFunctionType(), 0));
+  }
 
   // 3. Create static kmp_critical_name lock = { 0 };
   std::string Name = getName({"reduction"});
@@ -5026,8 +5034,8 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
       CGF.Builder.getInt32(RHSExprs.size()), // i32 <n>
       ReductionArrayTySize,                  // size_type sizeof(RedList)
       RL,                                    // void *RedList
-      ReductionFn, // void (*) (void *, void *) <reduce_func>
-      Lock         // kmp_critical_name *&<lock>
+      ReductionFnP, // void (*) (void *, void *) <reduce_func>
+      Lock          // kmp_critical_name *&<lock>
   };
   llvm::Value *Res = CGF.EmitRuntimeCall(
       OMPBuilder.getOrCreateRuntimeFunction(
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp b/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp
index 5154662034c56..a79ed9b0983bd 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp
@@ -266,6 +266,16 @@ void INTERFACE_ATTRIBUTE AnnotateBenignRace(
   BenignRaceImpl(f, l, mem, 1, desc);
 }
 
+void INTERFACE_ATTRIBUTE AnnotateAllAtomicBegin(char *f, int l) {
+  SCOPED_ANNOTATION(AnnotateAllAtomicBegin);
+  ThreadAtomicBegin(thr, pc);
+}
+
+void INTERFACE_ATTRIBUTE AnnotateAllAtomicEnd(char *f, int l) {
+  SCOPED_ANNOTATION(AnnotateAllAtomicEnd);
+  ThreadAtomicEnd(thr);
+}
+
 void INTERFACE_ATTRIBUTE AnnotateIgnoreReadsBegin(char *f, int l) {
   SCOPED_ANNOTATION(AnnotateIgnoreReadsBegin);
   ThreadIgnoreBegin(thr, pc);
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
index fd9441dfcb53c..c829247088f75 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
@@ -1053,6 +1053,21 @@ void ThreadIgnoreEnd(ThreadState *thr) {
   }
 }
 
+void ThreadAtomicBegin(ThreadState* thr, uptr pc) {
+  thr->all_atomic++;
+//  CHECK_GT(thr->ignore_reads_and_writes, 0);
+  CHECK_EQ(thr->all_atomic, 1);
+  thr->fast_state.SetAtomicBit();
+}
+
+void ThreadAtomicEnd(ThreadState *thr) {
+  CHECK_GT(thr->all_atomic, 0);
+  thr->all_atomic--;
+  if (thr->all_atomic == 0) {
+    thr->fast_state.ClearAtomicBit();
+  }
+}
+
 #if !SANITIZER_GO
 extern "C" SANITIZER_INTERFACE_ATTRIBUTE
 uptr __tsan_testonly_shadow_stack_current_size() {
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.h b/compiler-rt/lib/tsan/rtl/tsan_rtl.h
index de4ea0bb5f487..2a86007b47eef 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.h
@@ -182,6 +182,7 @@ struct ThreadState {
   // for better performance.
   int ignore_reads_and_writes;
   int suppress_reports;
+  int all_atomic;
   // Go does not support ignores.
 #if !SANITIZER_GO
   IgnoreSet mop_ignore_set;
@@ -550,6 +551,8 @@ void MemoryRangeImitateWrite(ThreadState *thr, uptr pc, uptr addr, uptr size);
 void MemoryRangeImitateWriteOrResetRange(ThreadState *thr, uptr pc, uptr addr,
                                          uptr size);
 
+void ThreadAtomicBegin(ThreadState *thr, uptr pc);
+void ThreadAtomicEnd(ThreadState *thr);
 void ThreadIgnoreBegin(ThreadState *thr, uptr pc);
 void ThreadIgnoreEnd(ThreadState *thr);
 void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc);
diff --git a/compiler-rt/lib/tsan/rtl/tsan_shadow.h b/compiler-rt/lib/tsan/rtl/tsan_shadow.h
index 6b8114ef51325..d22545d4fa2ee 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_shadow.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_shadow.h
@@ -9,6 +9,7 @@
 #ifndef TSAN_SHADOW_H
 #define TSAN_SHADOW_H
 
+#include "sanitizer_common/sanitizer_common.h"
 #include "tsan_defs.h"
 
 namespace __tsan {
@@ -21,8 +22,8 @@ class FastState {
     part_.unused0_ = 0;
     part_.sid_ = static_cast<u8>(kFreeSid);
     part_.epoch_ = static_cast<u16>(kEpochLast);
-    part_.unused1_ = 0;
     part_.ignore_accesses_ = false;
+    part_.all_atomic_ = false;
   }
 
   void SetSid(Sid sid) { part_.sid_ = static_cast<u8>(sid); }
@@ -37,14 +38,18 @@ class FastState {
   void ClearIgnoreBit() { part_.ignore_accesses_ = 0; }
   bool GetIgnoreBit() const { return part_.ignore_accesses_; }
 
+  void SetAtomicBit() { part_.all_atomic_ = 1; }
+  void ClearAtomicBit() { part_.all_atomic_ = 0; }
+  bool GetAtomicBit() const { return part_.all_atomic_; }
+
  private:
   friend class Shadow;
   struct Parts {
     u32 unused0_ : 8;
     u32 sid_ : 8;
     u32 epoch_ : kEpochBits;
-    u32 unused1_ : 1;
     u32 ignore_accesses_ : 1;
+    u32 all_atomic_ : 1;
   };
   union {
     Parts part_;
diff --git a/openmp/tools/archer/ompt-tsan.cpp b/openmp/tools/archer/ompt-tsan.cpp
index 8b338f6b18b6e..ece791683eedd 100644
--- a/openmp/tools/archer/ompt-tsan.cpp
+++ b/openmp/tools/archer/ompt-tsan.cpp
@@ -149,7 +149,7 @@ static ArcherFlags *archer_flags;
 // Thread Sanitizer is a tool that finds races in code.
 // See http://code.google.com/p/data-race-test/wiki/DynamicAnnotations .
 // tsan detects these exact functions by name.
-extern "C" {
+//extern "C" {
 static void (*AnnotateHappensAfter)(const char *, int, const volatile void *);
 static void (*AnnotateHappensBefore)(const char *, int, const volatile void *);
 static void (*AnnotateIgnoreWritesBegin)(const char *, int);
@@ -159,7 +159,9 @@ static void (*AnnotateNewMemory)(const char *, int, const volatile void *,
 static void (*__tsan_func_entry)(const void *);
 static void (*__tsan_func_exit)(void);
 static int (*RunningOnValgrind)(void);
-}
+static void (*AnnotateReductionBegin)(const char *, int);
+static void (*AnnotateReductionEnd)(const char *, int);
+//}
 
 // This marker is used to define a happens-before arc. The race detector will
 // infer an arc from the begin to the end when they share the same pointer
@@ -175,6 +177,10 @@ static int (*RunningOnValgrind)(void);
 // Resume checking for racy writes.
 #define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
 
+// Maps to either AnnotateAllAtomics or AnnotateIgnoreWrites 
+#define TsanReductionBegin() AnnotateReductionBegin(__FILE__, __LINE__)
+#define TsanReductionEnd() AnnotateReductionEnd(__FILE__, __LINE__)
+
 // We don't really delete the clock for now
 #define TsanDeleteClock(cv)
 
@@ -718,7 +724,7 @@ static void ompt_tsan_sync_region(ompt_sync_region_t kind,
         // 2. execution of another task.
         // For the latter case we will re-enable tracking in task_switch.
         Data->InBarrier = true;
-        TsanIgnoreWritesBegin();
+        TsanReductionBegin();
       }
 
       break;
@@ -751,7 +757,7 @@ static void ompt_tsan_sync_region(ompt_sync_region_t kind,
       if (hasReductionCallback < ompt_set_always) {
         // We want to track writes after the barrier again.
         Data->InBarrier = false;
-        TsanIgnoreWritesEnd();
+        TsanReductionEnd();
       }
 
       char BarrierIndex = Data->BarrierIndex;
@@ -806,7 +812,7 @@ static void ompt_tsan_reduction(ompt_sync_region_t kind,
   case ompt_scope_begin:
     switch (kind) {
     case ompt_sync_region_reduction:
-      TsanIgnoreWritesBegin();
+      TsanReductionBegin();
       break;
     default:
       break;
@@ -815,7 +821,7 @@ static void ompt_tsan_reduction(ompt_sync_region_t kind,
   case ompt_scope_end:
     switch (kind) {
     case ompt_sync_region_reduction:
-      TsanIgnoreWritesEnd();
+      TsanReductionEnd();
       break;
     default:
       break;
@@ -942,12 +948,12 @@ static void switchTasks(TaskData *FromTask, TaskData *ToTask) {
     if (FromTask && FromTask->InBarrier) {
       // We want to ignore writes in the runtime code during barriers,
       // but not when executing tasks with user code!
-      TsanIgnoreWritesEnd();
+      TsanReductionEnd();
     }
     if (ToTask && ToTask->InBarrier) {
       // We want to ignore writes in the runtime code during barriers,
       // but not when executing tasks with user code!
-      TsanIgnoreWritesBegin();
+      TsanReductionBegin();
     }
   }
   //// Not yet used
@@ -1147,6 +1153,7 @@ static void ompt_tsan_mutex_released(ompt_mutex_t kind, ompt_wait_id_t wait_id,
   } while (0)
 
 #define findTsanFunctionSilent(f, fSig) f = fSig dlsym(RTLD_DEFAULT, #f)
+#define findTsanFunctionName(f, name, fSig) f = fSig dlsym(RTLD_DEFAULT, #name)
 
 static int ompt_tsan_initialize(ompt_function_lookup_t lookup, int device_num,
                                 ompt_data_t *tool_data) {
@@ -1180,6 +1187,14 @@ static int ompt_tsan_initialize(ompt_function_lookup_t lookup, int device_num,
       (void (*)(const char *, int, const volatile void *, size_t)));
   findTsanFunction(__tsan_func_entry, (void (*)(const void *)));
   findTsanFunction(__tsan_func_exit, (void (*)(void)));
+  findTsanFunctionName(AnnotateReductionBegin, AnnotateAllAtomicBegin, (void (*)(const char *, int)));
+  findTsanFunctionName(AnnotateReductionEnd, AnnotateAllAtomicEnd, (void (*)(const char *, int)));
+  if (!AnnotateReductionBegin) {
+    AnnotateReductionBegin = AnnotateIgnoreWritesBegin;
+    AnnotateReductionEnd = AnnotateIgnoreWritesEnd;
+    if (archer_flags->verbose)
+      std::cout << "Archer uses fallback solution for reductions: might miss some race" << std::endl;
+  }
 
   SET_CALLBACK(thread_begin);
   SET_CALLBACK(thread_end);
diff --git a/openmp/tools/archer/tests/races/parallel-for-array-reduction-no-barrier.c b/openmp/tools/archer/tests/races/parallel-for-array-reduction-no-barrier.c
new file mode 100644
index 0000000000000..2cdfb811fcbc5
--- /dev/null
+++ b/openmp/tools/archer/tests/races/parallel-for-array-reduction-no-barrier.c
@@ -0,0 +1,42 @@
+/*
+ * parallel-reduction.c -- Archer testcase
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+//
+// See tools/archer/LICENSE.txt for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Number of threads is empirical: We need enough (>4) threads so that
+// the reduction is really performed hierarchically in the barrier!
+
+// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run-race | FileCheck %s
+// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run-race | FileCheck %s
+
+// REQUIRES: tsan
+#include <omp.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int var[10]={0,1,2,3,4,5,6,7,8,9};
+  
+#pragma omp parallel
+  {
+#pragma omp masked
+    var[5] = 23;
+#pragma omp for reduction(+ : var)
+    for (int i = 0; i < 1000; i++)
+      { var[i%10]++; }
+  }
+  fprintf(stderr, "DONE\n");
+  int error = (var[5] != 123);
+  return error;
+}
+
+// CHECK: ThreadSanitizer: data race
+// CHECK: DONE
+// CHECK: ThreadSanitizer: reported
diff --git a/openmp/tools/archer/tests/races/parallel-for-array-reduction-nowait.c b/openmp/tools/archer/tests/races/parallel-for-array-reduction-nowait.c
new file mode 100644
index 0000000000000..6ca17fcca6b54
--- /dev/null
+++ b/openmp/tools/archer/tests/races/parallel-for-array-reduction-nowait.c
@@ -0,0 +1,42 @@
+/*
+ * parallel-reduction.c -- Archer testcase
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+//
+// See tools/archer/LICENSE.txt for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Number of threads is empirical: We need enough (>4) threads so that
+// the reduction is really performed hierarchically in the barrier!
+
+// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run-race | FileCheck %s
+// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run-race | FileCheck %s
+
+// REQUIRES: tsan
+#include <omp.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int var[10]={0,1,2,3,4,5,6,7,8,9};
+  
+#pragma omp parallel
+  {
+#pragma omp for reduction(+ : var) nowait
+    for (int i = 0; i < 1000; i++)
+      { var[i%10]++; }
+#pragma omp masked
+    var[5] += 23;
+  }
+  fprintf(stderr, "DONE\n");
+  int error = (var[5] != 123);
+  return error;
+}
+
+// CHECK: ThreadSanitizer: data race
+// CHECK: DONE
+// CHECK: ThreadSanitizer: reported
diff --git a/openmp/tools/archer/tests/races/parallel-for-reduction-no-barrier.c b/openmp/tools/archer/tests/races/parallel-for-reduction-no-barrier.c
new file mode 100644
index 0000000000000..3cc63cfab1996
--- /dev/null
+++ b/openmp/tools/archer/tests/races/parallel-for-reduction-no-barrier.c
@@ -0,0 +1,42 @@
+/*
+ * parallel-reduction.c -- Archer testcase
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+//
+// See tools/archer/LICENSE.txt for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Number of threads is empirical: We need enough (>4) threads so that
+// the reduction is really performed hierarchically in the barrier!
+
+// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run-race | FileCheck %s
+// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run-race | FileCheck %s
+
+// REQUIRES: tsan
+#include <omp.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int var = 0;
+  
+#pragma omp parallel
+  {
+#pragma omp masked
+    var = 23;
+#pragma omp for reduction(+ : var)
+    for (int i = 0; i < 100; i++)
+      { var++; }
+  }
+  fprintf(stderr, "DONE\n");
+  int error = (var != 123);
+  return error;
+}
+
+// CHECK: ThreadSanitizer: data race
+// CHECK: DONE
+// CHECK: ThreadSanitizer: reported
diff --git a/openmp/tools/archer/tests/races/parallel-for-reduction-nowait.c b/openmp/tools/archer/tests/races/parallel-for-reduction-nowait.c
new file mode 100644
index 0000000000000..4c24e8a4f8285
--- /dev/null
+++ b/openmp/tools/archer/tests/races/parallel-for-reduction-nowait.c
@@ -0,0 +1,42 @@
+/*
+ * parallel-reduction.c -- Archer testcase
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+//
+// See tools/archer/LICENSE.txt for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Number of threads is empirical: We need enough (>4) threads so that
+// the reduction is really performed hierarchically in the barrier!
+
+// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run-race | FileCheck %s
+// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run-race | FileCheck %s
+
+// REQUIRES: tsan
+#include <omp.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int var = 0;
+  
+#pragma omp parallel
+  {
+#pragma omp for reduction(+ : var) nowait
+    for (int i = 0; i < 100; i++)
+      { var++; }
+#pragma omp masked
+    var = 23;
+  }
+  fprintf(stderr, "DONE\n");
+  int error = (var != 123);
+  return error;
+}
+
+// CHECK: ThreadSanitizer: data race
+// CHECK: DONE
+// CHECK: ThreadSanitizer: reported
diff --git a/openmp/tools/archer/tests/reduction/parallel-for-array-reduction-barrier.c b/openmp/tools/archer/tests/reduction/parallel-for-array-reduction-barrier.c
new file mode 100644
index 0000000000000..2ac9ec449baa7
--- /dev/null
+++ b/openmp/tools/archer/tests/reduction/parallel-for-array-reduction-barrier.c
@@ -0,0 +1,45 @@
+/*
+ * parallel-reduction.c -- Archer testcase
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+//
+// See tools/archer/LICENSE.txt for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Number of threads is empirical: We need enough (>4) threads so that
+// the reduction is really performed hierarchically in the barrier!
+
+// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run| FileCheck %s
+// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run| FileCheck %s
+
+// REQUIRES: tsan
+#include <omp.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int var[10]={0,1,2,3,4,5,6,7,8,9};
+  
+#pragma omp parallel
+  {
+#pragma omp masked
+    var[5] = 23;
+#pragma omp barrier
+#pragma omp for reduction(+ : var)
+    for (int i = 0; i < 1000; i++)
+      { var[i%10]++; }
+#pragma omp masked
+    var[4] += 42;
+  }
+  fprintf(stderr, "DONE\n");
+  int error = (var[5] != 23+100) || (var[4] != 4+100+42);
+  return error;
+}
+
+// CHECK-NOT: ThreadSanitizer: data race
+// CHECK-NOT: ThreadSanitizer: reported
+// CHECK: DONE
diff --git a/openmp/tools/archer/tests/reduction/parallel-for-reduction-barrier.c b/openmp/tools/archer/tests/reduction/parallel-for-reduction-barrier.c
new file mode 100644
index 0000000000000..5050684ce2dda
--- /dev/null
+++ b/openmp/tools/archer/tests/reduction/parallel-for-reduction-barrier.c
@@ -0,0 +1,45 @@
+/*
+ * parallel-reduction.c -- Archer testcase
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+//
+// See tools/archer/LICENSE.txt for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Number of threads is empirical: We need enough (>4) threads so that
+// the reduction is really performed hierarchically in the barrier!
+
+// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run| FileCheck %s
+// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run| FileCheck %s
+
+// REQUIRES: tsan
+#include <omp.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int var = 0;
+  
+#pragma omp parallel
+  {
+#pragma omp masked
+    var = 23;
+#pragma omp barrier
+#pragma omp for reduction(+ : var)
+    for (int i = 0; i < 100; i++)
+      { var++; }
+#pragma omp masked
+    var += 42;
+  }
+  fprintf(stderr, "DONE\n");
+  int error = (var != 23+100+42);
+  return error;
+}
+
+// CHECK-NOT: ThreadSanitizer: data race
+// CHECK-NOT: ThreadSanitizer: reported
+// CHECK: DONE
diff --git a/openmp/tools/archer/tests/reduction/parallel-reduction-nowait.c b/openmp/tools/archer/tests/reduction/parallel-reduction-nowait.c
index b91579f0b00c2..0f6697f213e85 100644
--- a/openmp/tools/archer/tests/reduction/parallel-reduction-nowait.c
+++ b/openmp/tools/archer/tests/reduction/parallel-reduction-nowait.c
@@ -37,6 +37,7 @@ int main(int argc, char *argv[]) {
   }
 
   fprintf(stderr, "DONE\n");
+  printf("var = %i\n", var);
   int error = (var != 100);
   return error;
 }
diff --git a/openmp/tools/archer/tests/reduction/parallel-reduction.c b/openmp/tools/archer/tests/reduction/parallel-reduction.c...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/74631