[clang] [llvm] [openmp] [OpenMP][offload] Fix dynamic schedule tracking (PR #97065)

Gheorghe-Teodor Bercea via cfe-commits cfe-commits at lists.llvm.org
Fri Jun 28 10:24:44 PDT 2024


================
@@ -444,32 +444,81 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
 // KMP interface implementation (dyn loops)
 ////////////////////////////////////////////////////////////////////////////////
 
-// TODO: This is a stopgap. We probably want to expand the dispatch API to take
-//       an DST pointer which can then be allocated properly without malloc.
-static DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr);
+// TODO: Expand the dispatch API to take a DST pointer which can then be
+//       allocated properly without malloc.
+// For now, each team will contain an LDS pointer (ThreadDST) to a global array
+// of pointers to the DST structs allocated (in global memory) for each thread
+// in the team. The global memory array is allocated during the init phase if it
+// was not allocated already and will be deallocated when the dispatch phase
+// ends:
+//
+//  __kmpc_dispatch_init
+//
+//  ** Dispatch loop **
+//
+//  __kmpc_dispatch_deinit
+//
+static DynamicScheduleTracker **SHARED(ThreadDST);
 
 // Create a new DST, link the current one, and define the new as current.
+// The new DST is stored in the calling thread's slot of ThreadDST and is also
+// the return value; the DST that was in the slot before stays reachable
+// through NextDST.
 static DynamicScheduleTracker *pushDST() {
+  int32_t ThreadIndex = mapping::getThreadIdInBlock();
+  // Each block will allocate an array of pointers to DST structs. The array is
+  // equal in length to the number of threads in that block.
+  // NOTE(review): every thread evaluates this test, but thread 0 writes
+  // ThreadDST before the barrier below. A thread that evaluates the test
+  // after that write would skip both the barrier and the reset of its slot.
+  // TODO confirm that all threads of a block always take the same path here.
+  if (!ThreadDST) {
+    // Allocate global memory array of pointers to DST structs:
+    if (ThreadIndex == 0)
+      ThreadDST = static_cast<DynamicScheduleTracker **>(
+          memory::allocGlobal(mapping::getNumberOfThreadsInBlock() *
+                                  sizeof(DynamicScheduleTracker *),
+                              "new ThreadDST array"));
+    // Presumably a block-wide barrier: the remaining threads wait here until
+    // thread 0 has stored the array pointer.
+    synchronize::threads(atomic::seq_cst);
+
+    // Initialize the array pointers:
+    ThreadDST[ThreadIndex] = nullptr;
+  }
+
+  // Create a DST struct for the current thread:
+  // NOTE(review): neither this allocation nor the array allocation above is
+  // checked before use - confirm that allocGlobal cannot return null.
   DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
       memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
   *NewDST = DynamicScheduleTracker({0});
-  NewDST->NextDST = ThreadDSTPtr;
-  ThreadDSTPtr = NewDST;
-  return ThreadDSTPtr;
+
+  // Add the new DST struct to the array of DST structs:
+  NewDST->NextDST = ThreadDST[ThreadIndex];
+  ThreadDST[ThreadIndex] = NewDST;
+  return NewDST;
 }
 
 // Return the current DST.
-static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }
+// This reads the calling thread's slot of ThreadDST without checking that the
+// array has been allocated.
+// NOTE(review): presumably only called after pushDST has run for this thread
+// and before the matching popDST - confirm against the callers.
+static DynamicScheduleTracker *peekDST() {
+  return ThreadDST[mapping::getThreadIdInBlock()];
+}
 
 // Pop the current DST and restore the last one.
 static void popDST() {
-  DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
-  memory::freeGlobal(ThreadDSTPtr, "remove DST");
-  ThreadDSTPtr = OldDST;
+  int32_t ThreadIndex = mapping::getThreadIdInBlock();
+  DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex];
+  DynamicScheduleTracker *OldDST = CurrentDST->NextDST;
+  memory::freeGlobal(CurrentDST, "remove DST");
+  ThreadDST[ThreadIndex] = OldDST;
+
+  // Check if we need to deallocate the global array. Ensure all threads
+  // in the block have finished deallocating the individual DSTs.
+  synchronize::threads(atomic::seq_cst);
+  if (ThreadDST[ThreadIndex] == 0 && ThreadIndex == 0)
+    memory::freeGlobal(ThreadDST, "remove ThreadDST array");
+  synchronize::threads(atomic::seq_cst);
----------------
doru1004 wrote:

For performance reasons?

https://github.com/llvm/llvm-project/pull/97065


More information about the cfe-commits mailing list