[Openmp-commits] [openmp] r337013 - [OpenMP][libomptarget] Fix data sharing and globalization infrastructure to work in SPMD mode

Gheorghe-Teodor Bercea via Openmp-commits openmp-commits at lists.llvm.org
Fri Jul 13 09:14:22 PDT 2018


Author: gbercea
Date: Fri Jul 13 09:14:22 2018
New Revision: 337013

URL: http://llvm.org/viewvc/llvm-project?rev=337013&view=rev
Log:
[OpenMP][libomptarget] Fix data sharing and globalization infrastructure to work in SPMD mode

Summary: This patch fixes the data sharing infrastructure to work for the SPMD and non-SPMD cases.

Reviewers: ABataev, grokos, carlo.bertolli, caomhin

Reviewed By: ABataev, grokos

Subscribers: guansong, openmp-commits

Differential Revision: https://reviews.llvm.org/D49204

Modified:
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu?rev=337013&r1=337012&r2=337013&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu Fri Jul 13 09:14:22 2018
@@ -44,7 +44,7 @@ __device__ static bool IsWarpMasterActiv
 }
 // Return true if this is the master thread.
 __device__ static bool IsMasterThread() {
-  return getMasterThreadId() == getThreadId();
+  return !isSPMDMode() && getMasterThreadId() == getThreadId();
 }
 
 /// Return the provided size aligned to the size of a pointer.
@@ -330,39 +330,40 @@ __kmpc_get_data_sharing_environment_fram
 // Runtime functions for trunk data sharing scheme.
 ////////////////////////////////////////////////////////////////////////////////
 
+INLINE void data_sharing_init_stack_common() {
+  omptarget_nvptx_TeamDescr *teamDescr =
+      &omptarget_nvptx_threadPrivateContext->TeamContext();
+
+  for (int WID = 0; WID < WARPSIZE; WID++) {
+    __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
+    DataSharingState.SlotPtr[WID] = RootS;
+    DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
+  }
+}
+
 // Initialize data sharing data structure. This function needs to be called
 // once at the beginning of a data sharing context (coincides with the kernel
-// initialization).
+// initialization). This function is called only by the MASTER thread of each
+// team in non-SPMD mode.
 EXTERN void __kmpc_data_sharing_init_stack() {
   // This function initializes the stack pointer with the pointer to the
   // statically allocated shared memory slots. The size of a shared memory
   // slot is pre-determined to be 256 bytes.
+  data_sharing_init_stack_common();
+  omptarget_nvptx_globalArgs.Init();
+}
 
-  // Initialize the data sharing structures. This section should only be
-  // executed by the warp active master threads.
-  if (IsWarpMasterActiveThread()) {
-    unsigned WID = getWarpId();
-    omptarget_nvptx_TeamDescr *teamDescr =
-        &omptarget_nvptx_threadPrivateContext->TeamContext();
-    __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID, IsMasterThread());
-
-    // If a valid address has been returned then proceed with the initalization.
-    // Otherwise the initialization of the slot has already happened in a
-    // previous call to this function.
-    if (RootS) {
-      DataSharingState.SlotPtr[WID] = RootS;
-      DataSharingState.TailPtr[WID] = RootS;
-      DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
-    }
-  }
+// Initialize data sharing data structure. This function needs to be called
+// once at the beginning of a data sharing context (coincides with the kernel
+// initialization). This function is called in SPMD mode only.
+EXTERN void __kmpc_data_sharing_init_stack_spmd() {
+  // This function initializes the stack pointer with the pointer to the
+  // statically allocated shared memory slots. The size of a shared memory
+  // slot is pre-determined to be 256 bytes.
+  if (threadIdx.x == 0)
+    data_sharing_init_stack_common();
 
-  // Currently we only support the sharing of variables between master and
-  // workers. The list of references to shared variables exists only for
-  // the master thread.
-  if (IsMasterThread()) {
-    // Initialize the list of references to arguments.
-    omptarget_nvptx_globalArgs.Init();
-  }
+  __threadfence_block();
 }
 
 // Called at the time of the kernel initialization. This is used to initilize
@@ -372,8 +373,6 @@ EXTERN void __kmpc_data_sharing_init_sta
 // By default the globalized variables are stored in global memory. If the
 // UseSharedMemory is set to true, the runtime will attempt to use shared memory
 // as long as the size requested fits the pre-allocated size.
-//
-// Called by: master, TODO: call by workers
 EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
     int16_t UseSharedMemory) {
   // Frame pointer must be visible to all workers in the same warp.
@@ -385,7 +384,6 @@ EXTERN void* __kmpc_data_sharing_push_st
     // SlotP will point to either the shared memory slot or an existing
     // global memory slot.
     __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
-    __kmpc_data_sharing_slot *&TailSlotP = DataSharingState.TailPtr[WID];
     void *&StackP = DataSharingState.StackPtr[WID];
 
     // Compute the total memory footprint of the requested data.
@@ -405,62 +403,31 @@ EXTERN void* __kmpc_data_sharing_push_st
     // of the slot then we need to either re-use the next slot, if one exists,
     // or create a new slot.
     if (EndAddress < RequestedEndAddress) {
-      size_t NewSize = PushSize;
-
-      // The new or reused slot for holding the data being pushed.
       __kmpc_data_sharing_slot *NewSlot = 0;
+      size_t NewSize = PushSize;
 
-      // Check if there is a next slot.
-      if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
-        // Attempt to reuse an existing slot provided the data fits in the slot.
-        // The leftover data space will not be used.
-        ptrdiff_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
-                                     (uintptr_t)(&ExistingSlot->Data[0]);
-
-        // Try to add the data in the next available slot. Search for a slot
-        // with enough space.
-        while (ExistingSlotSize < NewSize) {
-          SlotP->Next = ExistingSlot->Next;
-          SlotP->Next->Prev = ExistingSlot->Prev;
-          free(ExistingSlot);
-          ExistingSlot = SlotP->Next;
-          if (!ExistingSlot)
-            break;
-          ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
-                             (uintptr_t)(&ExistingSlot->Data[0]);
-        }
-
-        // Check if a slot has been found.
-        if (ExistingSlotSize >= NewSize) {
-          NewSlot = ExistingSlot;
-          NewSlot->PrevSlotStackPtr = StackP;
-        }
-      }
-
-      if (!NewSlot) {
-        // Allocate at least the default size for each type of slot.
-        size_t DefaultSlotSize =
-            IsMasterThread() ? DS_Slot_Size : DS_Worker_Warp_Slot_Size;
-        if (DefaultSlotSize > NewSize)
-          NewSize = DefaultSlotSize;
-        NewSlot = (__kmpc_data_sharing_slot *)malloc(
-            sizeof(__kmpc_data_sharing_slot) + NewSize);
-        NewSlot->Next = 0;
-        NewSlot->Prev = SlotP;
-        NewSlot->PrevSlotStackPtr = StackP;
-        NewSlot->DataEnd = &NewSlot->Data[NewSize];
-
-        // Newly allocated slots are also tail slots.
-        TailSlotP = NewSlot;
+      // Allocate at least the default size for each type of slot.
+      // Master is a special case and even though there is only one thread,
+      // it can share more things with the workers. For uniformity, it uses
+      // the full size of a worker warp slot.
+      size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size;
+      if (DefaultSlotSize > NewSize)
+        NewSize = DefaultSlotSize;
+      NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc(
+          sizeof(__kmpc_data_sharing_slot) + NewSize,
+          "Global memory slot allocation.");
 
-        // Make previous slot point to the newly allocated slot.
-        SlotP->Next = NewSlot;
-      }
+      NewSlot->Next = 0;
+      NewSlot->Prev = SlotP;
+      NewSlot->PrevSlotStackPtr = StackP;
+      NewSlot->DataEnd = &NewSlot->Data[0] + NewSize;
 
+      // Make previous slot point to the newly allocated slot.
+      SlotP->Next = NewSlot;
       // The current slot becomes the new slot.
       SlotP = NewSlot;
       // The stack pointer always points to the next free stack frame.
-      StackP = &NewSlot->Data[PushSize];
+      StackP = &NewSlot->Data[0] + PushSize;
       // The frame pointer always points to the beginning of the frame.
       FrameP = &NewSlot->Data[0];
     } else {
@@ -489,37 +456,27 @@ EXTERN void __kmpc_data_sharing_pop_stac
   if (IsWarpMasterActiveThread()) {
     unsigned WID = getWarpId();
 
+    // Current slot
     __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+
+    // Pointer to next available stack.
     void *&StackP = DataSharingState.StackPtr[WID];
 
-    // Pop current frame from slot.
-    StackP = FrameStart;
+    // If the current slot is empty, we need to free the slot after the
+    // pop.
+    bool SlotEmpty = (StackP == &SlotP->Data[0]);
 
-    // If we try to pop the last frame of the current slot we need to
-    // move to the previous slot if there is one.
-    const uintptr_t StartAddress = (uintptr_t)FrameStart;
-    if (StartAddress == (uintptr_t)&SlotP->Data[0]) {
-      if (SlotP->Prev) {
-        // The new stack pointer is the end of the data field of the
-        // previous slot. This will allow the stack pointer to be
-        // used in the computation of the remaining data space in
-        // the current slot.
-        StackP = SlotP->PrevSlotStackPtr;
-        // Reset SlotP to previous slot.
-        SlotP = SlotP->Prev;
-      }
+    // Pop the frame.
+    StackP = FrameStart;
 
-      // If this will "pop" the last global memory node then it is likely
-      // that we are at the end of the data sharing region and we can
-      // de-allocate any existing global memory slots.
-      if (!SlotP->Prev) {
-        __kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr[WID];
-        while(Tail->Prev) {
-          Tail = Tail->Prev;
-          free(Tail->Next);
-        }
-        Tail->Next=0;
-      }
+    if (SlotEmpty && SlotP->Prev) {
+      // Before removing the slot we need to reset StackP.
+      StackP = SlotP->PrevSlotStackPtr;
+
+      // Remove the slot.
+      SlotP = SlotP->Prev;
+      SafeFree(SlotP->Next, "Free slot.");
+      SlotP->Next = 0;
     }
   }
 

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h?rev=337013&r1=337012&r2=337013&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h Fri Jul 13 09:14:22 2018
@@ -485,6 +485,7 @@ EXTERN void __kmpc_kernel_end_convergent
 
 
 EXTERN void __kmpc_data_sharing_init_stack();
+EXTERN void __kmpc_data_sharing_init_stack_spmd();
 EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
 EXTERN void __kmpc_data_sharing_pop_stack(void *a);
 EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h?rev=337013&r1=337012&r2=337013&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h Fri Jul 13 09:14:22 2018
@@ -122,7 +122,6 @@ enum DATA_SHARING_SIZES {
 struct DataSharingStateTy {
   __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
   void *StackPtr[DS_Max_Warp_Number];
-  __kmpc_data_sharing_slot *TailPtr[DS_Max_Warp_Number];
   void *FramePtr[DS_Max_Warp_Number];
   int32_t ActiveThreads[DS_Max_Warp_Number];
 };
@@ -302,6 +301,16 @@ public:
     return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
   }
 
+  INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
+    worker_rootS[wid].DataEnd =
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
+    // We currently do not have a next slot.
+    worker_rootS[wid].Next = 0;
+    worker_rootS[wid].Prev = 0;
+    worker_rootS[wid].PrevSlotStackPtr = 0;
+    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
+  }
+
 private:
   omptarget_nvptx_TaskDescr
       levelZeroTaskDescr; // icv for team master initial thread
@@ -311,7 +320,7 @@ private:
   uint64_t lastprivateIterBuffer;
 
   __align__(16)
-      __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE - 1];
+      __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
   __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
 };
 




More information about the Openmp-commits mailing list