[clang] [openmp] [compiler-rt] [TSan][OpenMP][Archer] Treat all reduction operations as atomic (PR #74631)

Wed Dec 6 11:22:08 PST 2023

https://github.com/felilxtomski updated https://github.com/llvm/llvm-project/pull/74631

>From 669ddace7494027779b2501805353577efa1ea18 Mon Sep 17 00:00:00 2001
From: Joachim Jenke <jenke at itc.rwth-aachen.de>
Date: Tue, 11 Jul 2023 14:40:17 +0200
Subject: [PATCH 1/2] Treat all reduction operations as atomic

---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 14 ++++--
 .../lib/tsan/rtl/tsan_interface_ann.cpp       | 10 +++++
 compiler-rt/lib/tsan/rtl/tsan_rtl.cpp         | 15 +++++++
 compiler-rt/lib/tsan/rtl/tsan_rtl.h           |  3 ++
 compiler-rt/lib/tsan/rtl/tsan_shadow.h        |  9 +++-
 openmp/tools/archer/ompt-tsan.cpp             | 31 +++++++++----
 .../parallel-for-array-reduction-no-barrier.c | 42 +++++++++++++++++
 .../parallel-for-array-reduction-nowait.c     | 42 +++++++++++++++++
 .../races/parallel-for-reduction-no-barrier.c | 42 +++++++++++++++++
 .../races/parallel-for-reduction-nowait.c     | 42 +++++++++++++++++
 .../parallel-for-array-reduction-barrier.c    | 45 +++++++++++++++++++
 .../parallel-for-reduction-barrier.c          | 45 +++++++++++++++++++
 .../reduction/parallel-reduction-nowait.c     |  1 +
 .../tests/reduction/parallel-reduction.c      | 14 +++---
 14 files changed, 336 insertions(+), 19 deletions(-)
 create mode 100644 openmp/tools/archer/tests/races/parallel-for-array-reduction-no-barrier.c
 create mode 100644 openmp/tools/archer/tests/races/parallel-for-array-reduction-nowait.c
 create mode 100644 openmp/tools/archer/tests/races/parallel-for-reduction-no-barrier.c
 create mode 100644 openmp/tools/archer/tests/races/parallel-for-reduction-nowait.c
 create mode 100644 openmp/tools/archer/tests/reduction/parallel-for-array-reduction-barrier.c
 create mode 100644 openmp/tools/archer/tests/reduction/parallel-for-reduction-barrier.c

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 55648963df36a..a0381c315e5ec 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4810,13 +4810,16 @@ llvm::Function *CGOpenMPRuntime::emitReductionFunction(
   Args.push_back(&RHSArg);
   const auto &CGFI =
       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  CodeGenFunction CGF(CGM);
   std::string Name = getReductionFuncName(ReducerName);
   auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
                                     llvm::GlobalValue::InternalLinkage, Name,
                                     &CGM.getModule());
+  if (CGF.SanOpts.has(SanitizerKind::Thread)) {
+    return Fn;
+  }
   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
   Fn->setDoesNotRecurse();
-  CodeGenFunction CGF(CGM);
   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
 
   // Dst = (void*[n])(LHSArg);
@@ -5008,6 +5011,11 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
   llvm::Function *ReductionFn = emitReductionFunction(
       CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
       Privates, LHSExprs, RHSExprs, ReductionOps);
+  llvm::Value *ReductionFnP = ReductionFn;
+  if (CGF.SanOpts.has(SanitizerKind::Thread)) {
+    ReductionFnP = llvm::ConstantPointerNull::get(
+        llvm::PointerType::get(ReductionFn->getFunctionType(), 0));
+  }
 
   // 3. Create static kmp_critical_name lock = { 0 };
   std::string Name = getName({"reduction"});
@@ -5026,8 +5034,8 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
       CGF.Builder.getInt32(RHSExprs.size()), // i32 <n>
       ReductionArrayTySize,                  // size_type sizeof(RedList)
       RL,                                    // void *RedList
-      ReductionFn, // void (*) (void *, void *) <reduce_func>
-      Lock         // kmp_critical_name *&<lock>
+      ReductionFnP, // void (*) (void *, void *) <reduce_func>
+      Lock          // kmp_critical_name *&<lock>
   };
   llvm::Value *Res = CGF.EmitRuntimeCall(
       OMPBuilder.getOrCreateRuntimeFunction(
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp b/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp
index 5154662034c56..a79ed9b0983bd 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp
@@ -266,6 +266,16 @@ void INTERFACE_ATTRIBUTE AnnotateBenignRace(
   BenignRaceImpl(f, l, mem, 1, desc);
 }
 
+void INTERFACE_ATTRIBUTE AnnotateAllAtomicBegin(char *f, int l) {
+  SCOPED_ANNOTATION(AnnotateAllAtomicBegin);
+  ThreadAtomicBegin(thr, pc);
+}
+
+void INTERFACE_ATTRIBUTE AnnotateAllAtomicEnd(char *f, int l) {
+  SCOPED_ANNOTATION(AnnotateAllAtomicEnd);
+  ThreadAtomicEnd(thr);
+}
+
 void INTERFACE_ATTRIBUTE AnnotateIgnoreReadsBegin(char *f, int l) {
   SCOPED_ANNOTATION(AnnotateIgnoreReadsBegin);
   ThreadIgnoreBegin(thr, pc);
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
index fd9441dfcb53c..c829247088f75 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
@@ -1053,6 +1053,21 @@ void ThreadIgnoreEnd(ThreadState *thr) {
   }
 }
 
+void ThreadAtomicBegin(ThreadState* thr, uptr pc) {
+  thr->all_atomic++;
+//  CHECK_GT(thr->ignore_reads_and_writes, 0);
+  CHECK_EQ(thr->all_atomic, 1);
+  thr->fast_state.SetAtomicBit();
+}
+
+void ThreadAtomicEnd(ThreadState *thr) {
+  CHECK_GT(thr->all_atomic, 0);
+  thr->all_atomic--;
+  if (thr->all_atomic == 0) {
+    thr->fast_state.ClearAtomicBit();
+  }
+}
+
 #if !SANITIZER_GO
 extern "C" SANITIZER_INTERFACE_ATTRIBUTE
 uptr __tsan_testonly_shadow_stack_current_size() {
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.h b/compiler-rt/lib/tsan/rtl/tsan_rtl.h
index de4ea0bb5f487..2a86007b47eef 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.h
@@ -182,6 +182,7 @@ struct ThreadState {
   // for better performance.
   int ignore_reads_and_writes;
   int suppress_reports;
+  int all_atomic;
   // Go does not support ignores.
 #if !SANITIZER_GO
   IgnoreSet mop_ignore_set;
@@ -550,6 +551,8 @@ void MemoryRangeImitateWrite(ThreadState *thr, uptr pc, uptr addr, uptr size);
 void MemoryRangeImitateWriteOrResetRange(ThreadState *thr, uptr pc, uptr addr,
                                          uptr size);
 
+void ThreadAtomicBegin(ThreadState *thr, uptr pc);
+void ThreadAtomicEnd(ThreadState *thr);
 void ThreadIgnoreBegin(ThreadState *thr, uptr pc);
 void ThreadIgnoreEnd(ThreadState *thr);
 void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc);
diff --git a/compiler-rt/lib/tsan/rtl/tsan_shadow.h b/compiler-rt/lib/tsan/rtl/tsan_shadow.h
index 6b8114ef51325..d22545d4fa2ee 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_shadow.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_shadow.h
@@ -9,6 +9,7 @@
 #ifndef TSAN_SHADOW_H
 #define TSAN_SHADOW_H
 
+#include "sanitizer_common/sanitizer_common.h"
 #include "tsan_defs.h"
 
 namespace __tsan {
@@ -21,8 +22,8 @@ class FastState {
     part_.unused0_ = 0;
     part_.sid_ = static_cast<u8>(kFreeSid);
     part_.epoch_ = static_cast<u16>(kEpochLast);
-    part_.unused1_ = 0;
     part_.ignore_accesses_ = false;
+    part_.all_atomic_ = false;
   }
 
   void SetSid(Sid sid) { part_.sid_ = static_cast<u8>(sid); }
@@ -37,14 +38,18 @@ class FastState {
   void ClearIgnoreBit() { part_.ignore_accesses_ = 0; }
   bool GetIgnoreBit() const { return part_.ignore_accesses_; }
 
+  void SetAtomicBit() { part_.all_atomic_ = 1; }
+  void ClearAtomicBit() { part_.all_atomic_ = 0; }
+  bool GetAtomicBit() const { return part_.all_atomic_; }
+
  private:
   friend class Shadow;
   struct Parts {
     u32 unused0_ : 8;
     u32 sid_ : 8;
     u32 epoch_ : kEpochBits;
-    u32 unused1_ : 1;
     u32 ignore_accesses_ : 1;
+    u32 all_atomic_ : 1;
   };
   union {
     Parts part_;
diff --git a/openmp/tools/archer/ompt-tsan.cpp b/openmp/tools/archer/ompt-tsan.cpp
index 8b338f6b18b6e..ece791683eedd 100644
--- a/openmp/tools/archer/ompt-tsan.cpp
+++ b/openmp/tools/archer/ompt-tsan.cpp
@@ -149,7 +149,7 @@ static ArcherFlags *archer_flags;
 // Thread Sanitizer is a tool that finds races in code.
 // See http://code.google.com/p/data-race-test/wiki/DynamicAnnotations .
 // tsan detects these exact functions by name.
-extern "C" {
+//extern "C" {
 static void (*AnnotateHappensAfter)(const char *, int, const volatile void *);
 static void (*AnnotateHappensBefore)(const char *, int, const volatile void *);
 static void (*AnnotateIgnoreWritesBegin)(const char *, int);
@@ -159,7 +159,9 @@ static void (*AnnotateNewMemory)(const char *, int, const volatile void *,
 static void (*__tsan_func_entry)(const void *);
 static void (*__tsan_func_exit)(void);
 static int (*RunningOnValgrind)(void);
-}
+static void (*AnnotateReductionBegin)(const char *, int);
+static void (*AnnotateReductionEnd)(const char *, int);
+//}
 
 // This marker is used to define a happens-before arc. The race detector will
 // infer an arc from the begin to the end when they share the same pointer
@@ -175,6 +177,10 @@ static int (*RunningOnValgrind)(void);
 // Resume checking for racy writes.
 #define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
 
+// Maps to either AnnotateAllAtomics or AnnotateIgnoreWrites 
+#define TsanReductionBegin() AnnotateReductionBegin(__FILE__, __LINE__)
+#define TsanReductionEnd() AnnotateReductionEnd(__FILE__, __LINE__)
+
 // We don't really delete the clock for now
 #define TsanDeleteClock(cv)
 
@@ -718,7 +724,7 @@ static void ompt_tsan_sync_region(ompt_sync_region_t kind,
         // 2. execution of another task.
         // For the latter case we will re-enable tracking in task_switch.
         Data->InBarrier = true;
-        TsanIgnoreWritesBegin();
+        TsanReductionBegin();
       }
 
       break;
@@ -751,7 +757,7 @@ static void ompt_tsan_sync_region(ompt_sync_region_t kind,
       if (hasReductionCallback < ompt_set_always) {
         // We want to track writes after the barrier again.
         Data->InBarrier = false;
-        TsanIgnoreWritesEnd();
+        TsanReductionEnd();
       }
 
       char BarrierIndex = Data->BarrierIndex;
@@ -806,7 +812,7 @@ static void ompt_tsan_reduction(ompt_sync_region_t kind,
   case ompt_scope_begin:
     switch (kind) {
     case ompt_sync_region_reduction:
-      TsanIgnoreWritesBegin();
+      TsanReductionBegin();
       break;
     default:
       break;
@@ -815,7 +821,7 @@ static void ompt_tsan_reduction(ompt_sync_region_t kind,
   case ompt_scope_end:
     switch (kind) {
     case ompt_sync_region_reduction:
-      TsanIgnoreWritesEnd();
+      TsanReductionEnd();
       break;
     default:
       break;
@@ -942,12 +948,12 @@ static void switchTasks(TaskData *FromTask, TaskData *ToTask) {
     if (FromTask && FromTask->InBarrier) {
       // We want to ignore writes in the runtime code during barriers,
       // but not when executing tasks with user code!
-      TsanIgnoreWritesEnd();
+      TsanReductionEnd();
     }
     if (ToTask && ToTask->InBarrier) {
       // We want to ignore writes in the runtime code during barriers,
       // but not when executing tasks with user code!
-      TsanIgnoreWritesBegin();
+      TsanReductionBegin();
     }
   }
   //// Not yet used
@@ -1147,6 +1153,7 @@ static void ompt_tsan_mutex_released(ompt_mutex_t kind, ompt_wait_id_t wait_id,
   } while (0)
 
 #define findTsanFunctionSilent(f, fSig) f = fSig dlsym(RTLD_DEFAULT, #f)
+#define findTsanFunctionName(f, name, fSig) f = fSig dlsym(RTLD_DEFAULT, #name)
 
 static int ompt_tsan_initialize(ompt_function_lookup_t lookup, int device_num,
                                 ompt_data_t *tool_data) {
@@ -1180,6 +1187,14 @@ static int ompt_tsan_initialize(ompt_function_lookup_t lookup, int device_num,
       (void (*)(const char *, int, const volatile void *, size_t)));
   findTsanFunction(__tsan_func_entry, (void (*)(const void *)));
   findTsanFunction(__tsan_func_exit, (void (*)(void)));
+  findTsanFunctionName(AnnotateReductionBegin, AnnotateAllAtomicBegin, (void (*)(const char *, int)));
+  findTsanFunctionName(AnnotateReductionEnd, AnnotateAllAtomicEnd, (void (*)(const char *, int)));
+  if (!AnnotateReductionBegin) {
+    AnnotateReductionBegin = AnnotateIgnoreWritesBegin;
+    AnnotateReductionEnd = AnnotateIgnoreWritesEnd;
+    if (archer_flags->verbose)
+      std::cout << "Archer uses fallback solution for reductions: might miss some race" << std::endl;
+  }
 
   SET_CALLBACK(thread_begin);
   SET_CALLBACK(thread_end);
diff --git a/openmp/tools/archer/tests/races/parallel-for-array-reduction-no-barrier.c b/openmp/tools/archer/tests/races/parallel-for-array-reduction-no-barrier.c
new file mode 100644
index 0000000000000..2cdfb811fcbc5
--- /dev/null
+++ b/openmp/tools/archer/tests/races/parallel-for-array-reduction-no-barrier.c
@@ -0,0 +1,42 @@
+/*
+ * parallel-reduction.c -- Archer testcase
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+//
+// See tools/archer/LICENSE.txt for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Number of threads is empirical: We need enough (>4) threads so that
+// the reduction is really performed hierarchically in the barrier!
+
+// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run-race | FileCheck %s
+// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run-race | FileCheck %s
+
+// REQUIRES: tsan
+#include <omp.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int var[10]={0,1,2,3,4,5,6,7,8,9};
+  
+#pragma omp parallel
+  {
+#pragma omp masked
+    var[5] = 23;
+#pragma omp for reduction(+ : var)
+    for (int i = 0; i < 1000; i++)
+      { var[i%10]++; }
+  }
+  fprintf(stderr, "DONE\n");
+  int error = (var[5] != 123);
+  return error;
+}
+
+// CHECK: ThreadSanitizer: data race
+// CHECK: DONE
+// CHECK: ThreadSanitizer: reported
diff --git a/openmp/tools/archer/tests/races/parallel-for-array-reduction-nowait.c b/openmp/tools/archer/tests/races/parallel-for-array-reduction-nowait.c
new file mode 100644
index 0000000000000..6ca17fcca6b54
--- /dev/null
+++ b/openmp/tools/archer/tests/races/parallel-for-array-reduction-nowait.c
@@ -0,0 +1,42 @@
+/*
+ * parallel-reduction.c -- Archer testcase
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+//
+// See tools/archer/LICENSE.txt for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Number of threads is empirical: We need enough (>4) threads so that
+// the reduction is really performed hierarchically in the barrier!
+
+// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run-race | FileCheck %s
+// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run-race | FileCheck %s
+
+// REQUIRES: tsan
+#include <omp.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int var[10]={0,1,2,3,4,5,6,7,8,9};
+  
+#pragma omp parallel
+  {
+#pragma omp for reduction(+ : var) nowait
+    for (int i = 0; i < 1000; i++)
+      { var[i%10]++; }
+#pragma omp masked
+    var[5] += 23;
+  }
+  fprintf(stderr, "DONE\n");
+  int error = (var[5] != 123);
+  return error;
+}
+
+// CHECK: ThreadSanitizer: data race
+// CHECK: DONE
+// CHECK: ThreadSanitizer: reported
diff --git a/openmp/tools/archer/tests/races/parallel-for-reduction-no-barrier.c b/openmp/tools/archer/tests/races/parallel-for-reduction-no-barrier.c
new file mode 100644
index 0000000000000..3cc63cfab1996
--- /dev/null
+++ b/openmp/tools/archer/tests/races/parallel-for-reduction-no-barrier.c
@@ -0,0 +1,42 @@
+/*
+ * parallel-reduction.c -- Archer testcase
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+//
+// See tools/archer/LICENSE.txt for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Number of threads is empirical: We need enough (>4) threads so that
+// the reduction is really performed hierarchically in the barrier!
+
+// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run-race | FileCheck %s
+// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run-race | FileCheck %s
+
+// REQUIRES: tsan
+#include <omp.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int var = 0;
+  
+#pragma omp parallel
+  {
+#pragma omp masked
+    var = 23;
+#pragma omp for reduction(+ : var)
+    for (int i = 0; i < 100; i++)
+      { var++; }
+  }
+  fprintf(stderr, "DONE\n");
+  int error = (var != 123);
+  return error;
+}
+
+// CHECK: ThreadSanitizer: data race
+// CHECK: DONE
+// CHECK: ThreadSanitizer: reported
diff --git a/openmp/tools/archer/tests/races/parallel-for-reduction-nowait.c b/openmp/tools/archer/tests/races/parallel-for-reduction-nowait.c
new file mode 100644
index 0000000000000..4c24e8a4f8285
--- /dev/null
+++ b/openmp/tools/archer/tests/races/parallel-for-reduction-nowait.c
@@ -0,0 +1,42 @@
+/*
+ * parallel-reduction.c -- Archer testcase
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+//
+// See tools/archer/LICENSE.txt for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Number of threads is empirical: We need enough (>4) threads so that
+// the reduction is really performed hierarchically in the barrier!
+
+// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run-race | FileCheck %s
+// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run-race | FileCheck %s
+
+// REQUIRES: tsan
+#include <omp.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int var = 0;
+  
+#pragma omp parallel
+  {
+#pragma omp for reduction(+ : var) nowait
+    for (int i = 0; i < 100; i++)
+      { var++; }
+#pragma omp masked
+    var = 23;
+  }
+  fprintf(stderr, "DONE\n");
+  int error = (var != 123);
+  return error;
+}
+
+// CHECK: ThreadSanitizer: data race
+// CHECK: DONE
+// CHECK: ThreadSanitizer: reported
diff --git a/openmp/tools/archer/tests/reduction/parallel-for-array-reduction-barrier.c b/openmp/tools/archer/tests/reduction/parallel-for-array-reduction-barrier.c
new file mode 100644
index 0000000000000..2ac9ec449baa7
--- /dev/null
+++ b/openmp/tools/archer/tests/reduction/parallel-for-array-reduction-barrier.c
@@ -0,0 +1,45 @@
+/*
+ * parallel-reduction.c -- Archer testcase
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+//
+// See tools/archer/LICENSE.txt for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Number of threads is empirical: We need enough (>4) threads so that
+// the reduction is really performed hierarchically in the barrier!
+
+// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run| FileCheck %s
+// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run| FileCheck %s
+
+// REQUIRES: tsan
+#include <omp.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int var[10]={0,1,2,3,4,5,6,7,8,9};
+  
+#pragma omp parallel
+  {
+#pragma omp masked
+    var[5] = 23;
+#pragma omp barrier
+#pragma omp for reduction(+ : var)
+    for (int i = 0; i < 1000; i++)
+      { var[i%10]++; }
+#pragma omp masked
+    var[4] += 42;
+  }
+  fprintf(stderr, "DONE\n");
+  int error = (var[5] != 23+100) || (var[4] != 4+100+42);
+  return error;
+}
+
+// CHECK-NOT: ThreadSanitizer: data race
+// CHECK-NOT: ThreadSanitizer: reported
+// CHECK: DONE
diff --git a/openmp/tools/archer/tests/reduction/parallel-for-reduction-barrier.c b/openmp/tools/archer/tests/reduction/parallel-for-reduction-barrier.c
new file mode 100644
index 0000000000000..5050684ce2dda
--- /dev/null
+++ b/openmp/tools/archer/tests/reduction/parallel-for-reduction-barrier.c
@@ -0,0 +1,45 @@
+/*
+ * parallel-reduction.c -- Archer testcase
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+//
+// See tools/archer/LICENSE.txt for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Number of threads is empirical: We need enough (>4) threads so that
+// the reduction is really performed hierarchically in the barrier!
+
+// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run| FileCheck %s
+// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run| FileCheck %s
+
+// REQUIRES: tsan
+#include <omp.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int var = 0;
+  
+#pragma omp parallel
+  {
+#pragma omp masked
+    var = 23;
+#pragma omp barrier
+#pragma omp for reduction(+ : var)
+    for (int i = 0; i < 100; i++)
+      { var++; }
+#pragma omp masked
+    var += 42;
+  }
+  fprintf(stderr, "DONE\n");
+  int error = (var != 23+100+42);
+  return error;
+}
+
+// CHECK-NOT: ThreadSanitizer: data race
+// CHECK-NOT: ThreadSanitizer: reported
+// CHECK: DONE
diff --git a/openmp/tools/archer/tests/reduction/parallel-reduction-nowait.c b/openmp/tools/archer/tests/reduction/parallel-reduction-nowait.c
index b91579f0b00c2..0f6697f213e85 100644
--- a/openmp/tools/archer/tests/reduction/parallel-reduction-nowait.c
+++ b/openmp/tools/archer/tests/reduction/parallel-reduction-nowait.c
@@ -37,6 +37,7 @@ int main(int argc, char *argv[]) {
   }
 
   fprintf(stderr, "DONE\n");
+  printf("var = %i\n", var);
   int error = (var != 100);
   return error;
 }
diff --git a/openmp/tools/archer/tests/reduction/parallel-reduction.c b/openmp/tools/archer/tests/reduction/parallel-reduction.c
index 6d1a556ac00ed..afb2863235d32 100644
--- a/openmp/tools/archer/tests/reduction/parallel-reduction.c
+++ b/openmp/tools/archer/tests/reduction/parallel-reduction.c
@@ -11,22 +11,24 @@
 //
 //===----------------------------------------------------------------------===//
 
+// Number of threads is empirical: We need enough (>4) threads so that
+// the reduction is really performed hierarchically in the barrier!
+
+// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run| FileCheck %s
+// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run| FileCheck %s
 
-// RUN: %libarcher-compile-and-run| FileCheck %s
 // REQUIRES: tsan
 #include <omp.h>
 #include <stdio.h>
 
 int main(int argc, char *argv[]) {
   int var = 0;
-
-// Number of threads is empirical: We need enough threads so that
-// the reduction is really performed hierarchically in the barrier!
-#pragma omp parallel num_threads(5) reduction(+ : var)
+  
+#pragma omp parallel reduction(+ : var)
   { var = 1; }
 
   fprintf(stderr, "DONE\n");
-  int error = (var != 5);
+  int error = (var != omp_get_max_threads());
   return error;
 }
 

>From e9ead10753caf0ac2e087466eccf58e91bec891a Mon Sep 17 00:00:00 2001
From: "felix.tomski" <tomski at itc.rwth-aachen.de>
Date: Wed, 6 Dec 2023 20:21:46 +0100
Subject: [PATCH 2/2] Fix format

---
 compiler-rt/lib/tsan/rtl/tsan_rtl.cpp              |  4 ++--
 openmp/tools/archer/ompt-tsan.cpp                  | 14 +++++++++-----
 .../parallel-for-array-reduction-no-barrier.c      |  9 +++++----
 .../races/parallel-for-array-reduction-nowait.c    |  9 +++++----
 .../races/parallel-for-reduction-no-barrier.c      |  7 ++++---
 .../tests/races/parallel-for-reduction-nowait.c    |  7 ++++---
 .../parallel-for-array-reduction-barrier.c         | 11 ++++++-----
 .../reduction/parallel-for-reduction-barrier.c     |  9 +++++----
 .../archer/tests/reduction/parallel-reduction.c    |  2 +-
 9 files changed, 41 insertions(+), 31 deletions(-)

diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
index c829247088f75..cdbe4cb3442ac 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
@@ -1055,12 +1055,12 @@ void ThreadIgnoreEnd(ThreadState *thr) {
 
 void ThreadAtomicBegin(ThreadState* thr, uptr pc) {
   thr->all_atomic++;
-//  CHECK_GT(thr->ignore_reads_and_writes, 0);
+  //  CHECK_GT(thr->ignore_reads_and_writes, 0);
   CHECK_EQ(thr->all_atomic, 1);
   thr->fast_state.SetAtomicBit();
 }
 
-void ThreadAtomicEnd(ThreadState *thr) {
+void ThreadAtomicEnd(ThreadState* thr) {
   CHECK_GT(thr->all_atomic, 0);
   thr->all_atomic--;
   if (thr->all_atomic == 0) {
diff --git a/openmp/tools/archer/ompt-tsan.cpp b/openmp/tools/archer/ompt-tsan.cpp
index ece791683eedd..673b23aca8ab5 100644
--- a/openmp/tools/archer/ompt-tsan.cpp
+++ b/openmp/tools/archer/ompt-tsan.cpp
@@ -149,7 +149,7 @@ static ArcherFlags *archer_flags;
 // Thread Sanitizer is a tool that finds races in code.
 // See http://code.google.com/p/data-race-test/wiki/DynamicAnnotations .
 // tsan detects these exact functions by name.
-//extern "C" {
+// extern "C" {
 static void (*AnnotateHappensAfter)(const char *, int, const volatile void *);
 static void (*AnnotateHappensBefore)(const char *, int, const volatile void *);
 static void (*AnnotateIgnoreWritesBegin)(const char *, int);
@@ -177,7 +177,7 @@ static void (*AnnotateReductionEnd)(const char *, int);
 // Resume checking for racy writes.
 #define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
 
-// Maps to either AnnotateAllAtomics or AnnotateIgnoreWrites 
+// Maps to either AnnotateAllAtomics or AnnotateIgnoreWrites
 #define TsanReductionBegin() AnnotateReductionBegin(__FILE__, __LINE__)
 #define TsanReductionEnd() AnnotateReductionEnd(__FILE__, __LINE__)
 
@@ -1187,13 +1187,17 @@ static int ompt_tsan_initialize(ompt_function_lookup_t lookup, int device_num,
       (void (*)(const char *, int, const volatile void *, size_t)));
   findTsanFunction(__tsan_func_entry, (void (*)(const void *)));
   findTsanFunction(__tsan_func_exit, (void (*)(void)));
-  findTsanFunctionName(AnnotateReductionBegin, AnnotateAllAtomicBegin, (void (*)(const char *, int)));
-  findTsanFunctionName(AnnotateReductionEnd, AnnotateAllAtomicEnd, (void (*)(const char *, int)));
+  findTsanFunctionName(AnnotateReductionBegin, AnnotateAllAtomicBegin,
+                       (void (*)(const char *, int)));
+  findTsanFunctionName(AnnotateReductionEnd, AnnotateAllAtomicEnd,
+                       (void (*)(const char *, int)));
   if (!AnnotateReductionBegin) {
     AnnotateReductionBegin = AnnotateIgnoreWritesBegin;
     AnnotateReductionEnd = AnnotateIgnoreWritesEnd;
     if (archer_flags->verbose)
-      std::cout << "Archer uses fallback solution for reductions: might miss some race" << std::endl;
+      std::cout << "Archer uses fallback solution for reductions: might miss "
+                   "some race"
+                << std::endl;
   }
 
   SET_CALLBACK(thread_begin);
diff --git a/openmp/tools/archer/tests/races/parallel-for-array-reduction-no-barrier.c b/openmp/tools/archer/tests/races/parallel-for-array-reduction-no-barrier.c
index 2cdfb811fcbc5..511a21013da4d 100644
--- a/openmp/tools/archer/tests/races/parallel-for-array-reduction-no-barrier.c
+++ b/openmp/tools/archer/tests/races/parallel-for-array-reduction-no-barrier.c
@@ -22,15 +22,16 @@
 #include <stdio.h>
 
 int main(int argc, char *argv[]) {
-  int var[10]={0,1,2,3,4,5,6,7,8,9};
-  
+  int var[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+
 #pragma omp parallel
   {
 #pragma omp masked
     var[5] = 23;
 #pragma omp for reduction(+ : var)
-    for (int i = 0; i < 1000; i++)
-      { var[i%10]++; }
+    for (int i = 0; i < 1000; i++) {
+      var[i % 10]++;
+    }
   }
   fprintf(stderr, "DONE\n");
   int error = (var[5] != 123);
diff --git a/openmp/tools/archer/tests/races/parallel-for-array-reduction-nowait.c b/openmp/tools/archer/tests/races/parallel-for-array-reduction-nowait.c
index 6ca17fcca6b54..bdac83d7ca6b2 100644
--- a/openmp/tools/archer/tests/races/parallel-for-array-reduction-nowait.c
+++ b/openmp/tools/archer/tests/races/parallel-for-array-reduction-nowait.c
@@ -22,13 +22,14 @@
 #include <stdio.h>
 
 int main(int argc, char *argv[]) {
-  int var[10]={0,1,2,3,4,5,6,7,8,9};
-  
+  int var[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+
 #pragma omp parallel
   {
 #pragma omp for reduction(+ : var) nowait
-    for (int i = 0; i < 1000; i++)
-      { var[i%10]++; }
+    for (int i = 0; i < 1000; i++) {
+      var[i % 10]++;
+    }
 #pragma omp masked
     var[5] += 23;
   }
diff --git a/openmp/tools/archer/tests/races/parallel-for-reduction-no-barrier.c b/openmp/tools/archer/tests/races/parallel-for-reduction-no-barrier.c
index 3cc63cfab1996..3ea874a0deb6f 100644
--- a/openmp/tools/archer/tests/races/parallel-for-reduction-no-barrier.c
+++ b/openmp/tools/archer/tests/races/parallel-for-reduction-no-barrier.c
@@ -23,14 +23,15 @@
 
 int main(int argc, char *argv[]) {
   int var = 0;
-  
+
 #pragma omp parallel
   {
 #pragma omp masked
     var = 23;
 #pragma omp for reduction(+ : var)
-    for (int i = 0; i < 100; i++)
-      { var++; }
+    for (int i = 0; i < 100; i++) {
+      var++;
+    }
   }
   fprintf(stderr, "DONE\n");
   int error = (var != 123);
diff --git a/openmp/tools/archer/tests/races/parallel-for-reduction-nowait.c b/openmp/tools/archer/tests/races/parallel-for-reduction-nowait.c
index 4c24e8a4f8285..11afceb671a53 100644
--- a/openmp/tools/archer/tests/races/parallel-for-reduction-nowait.c
+++ b/openmp/tools/archer/tests/races/parallel-for-reduction-nowait.c
@@ -23,12 +23,13 @@
 
 int main(int argc, char *argv[]) {
   int var = 0;
-  
+
 #pragma omp parallel
   {
 #pragma omp for reduction(+ : var) nowait
-    for (int i = 0; i < 100; i++)
-      { var++; }
+    for (int i = 0; i < 100; i++) {
+      var++;
+    }
 #pragma omp masked
     var = 23;
   }
diff --git a/openmp/tools/archer/tests/reduction/parallel-for-array-reduction-barrier.c b/openmp/tools/archer/tests/reduction/parallel-for-array-reduction-barrier.c
index 2ac9ec449baa7..d5d7ca525093c 100644
--- a/openmp/tools/archer/tests/reduction/parallel-for-array-reduction-barrier.c
+++ b/openmp/tools/archer/tests/reduction/parallel-for-array-reduction-barrier.c
@@ -22,21 +22,22 @@
 #include <stdio.h>
 
 int main(int argc, char *argv[]) {
-  int var[10]={0,1,2,3,4,5,6,7,8,9};
-  
+  int var[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+
 #pragma omp parallel
   {
 #pragma omp masked
     var[5] = 23;
 #pragma omp barrier
 #pragma omp for reduction(+ : var)
-    for (int i = 0; i < 1000; i++)
-      { var[i%10]++; }
+    for (int i = 0; i < 1000; i++) {
+      var[i % 10]++;
+    }
 #pragma omp masked
     var[4] += 42;
   }
   fprintf(stderr, "DONE\n");
-  int error = (var[5] != 23+100) || (var[4] != 4+100+42);
+  int error = (var[5] != 23 + 100) || (var[4] != 4 + 100 + 42);
   return error;
 }
 
diff --git a/openmp/tools/archer/tests/reduction/parallel-for-reduction-barrier.c b/openmp/tools/archer/tests/reduction/parallel-for-reduction-barrier.c
index 5050684ce2dda..5ee6928161b5c 100644
--- a/openmp/tools/archer/tests/reduction/parallel-for-reduction-barrier.c
+++ b/openmp/tools/archer/tests/reduction/parallel-for-reduction-barrier.c
@@ -23,20 +23,21 @@
 
 int main(int argc, char *argv[]) {
   int var = 0;
-  
+
 #pragma omp parallel
   {
 #pragma omp masked
     var = 23;
 #pragma omp barrier
 #pragma omp for reduction(+ : var)
-    for (int i = 0; i < 100; i++)
-      { var++; }
+    for (int i = 0; i < 100; i++) {
+      var++;
+    }
 #pragma omp masked
     var += 42;
   }
   fprintf(stderr, "DONE\n");
-  int error = (var != 23+100+42);
+  int error = (var != 23 + 100 + 42);
   return error;
 }
 
diff --git a/openmp/tools/archer/tests/reduction/parallel-reduction.c b/openmp/tools/archer/tests/reduction/parallel-reduction.c
index afb2863235d32..887fe2e018281 100644
--- a/openmp/tools/archer/tests/reduction/parallel-reduction.c
+++ b/openmp/tools/archer/tests/reduction/parallel-reduction.c
@@ -23,7 +23,7 @@
 
 int main(int argc, char *argv[]) {
   int var = 0;
-  
+
 #pragma omp parallel reduction(+ : var)
   { var = 1; }