[Openmp-commits] [openmp] r345867 - [OpenMP][libomptarget] Add runtime function for pushing coalesced global records

Gheorghe-Teodor Bercea via Openmp-commits openmp-commits at lists.llvm.org
Thu Nov 1 11:08:12 PDT 2018


Author: gbercea
Date: Thu Nov  1 11:08:12 2018
New Revision: 345867

URL: http://llvm.org/viewvc/llvm-project?rev=345867&view=rev
Log:
[OpenMP][libomptarget] Add runtime function for pushing coalesced global records

Summary: In the case of coalesced global records, the runtime must push exactly the data size that is passed in. This patch addresses that by outlining the common functionality of the previous push function into a shared helper and by adding a separate entry point for coalesced pushes. The pop function remains unchanged.

Reviewers: ABataev, grokos, caomhin

Reviewed By: ABataev, grokos

Subscribers: jholewinski, cfe-commits, Hahnfeld, guansong, jfb, openmp-commits

Differential Revision: https://reviews.llvm.org/D53141

Modified:
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu?rev=345867&r1=345866&r2=345867&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu Thu Nov  1 11:08:12 2018
@@ -129,7 +129,7 @@ EXTERN void *__kmpc_data_sharing_environ
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
   void *&StackP = DataSharingState.StackPtr[WID];
-  void *&FrameP = DataSharingState.FramePtr[WID];
+  void * volatile &FrameP = DataSharingState.FramePtr[WID];
   int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
 
   DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
@@ -283,7 +283,7 @@ EXTERN void __kmpc_data_sharing_environm
 
       __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
       void *&StackP = DataSharingState.StackPtr[WID];
-      void *&FrameP = DataSharingState.FramePtr[WID];
+      void * volatile &FrameP = DataSharingState.FramePtr[WID];
 
       SlotP = *SavedSharedSlot;
       StackP = *SavedSharedStack;
@@ -321,7 +321,7 @@ __kmpc_get_data_sharing_environment_fram
 
   DSPRINT(DSFLAG, "Source  warp: %d\n", SourceWID);
 
-  void *P = DataSharingState.FramePtr[SourceWID];
+  void * volatile P = DataSharingState.FramePtr[SourceWID];
   DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
   return P;
 }
@@ -369,47 +369,31 @@ EXTERN void __kmpc_data_sharing_init_sta
   __threadfence_block();
 }
 
-// Called at the time of the kernel initialization. This is used to initilize
-// the list of references to shared variables and to pre-allocate global storage
-// for holding the globalized variables.
-//
-// By default the globalized variables are stored in global memory. If the
-// UseSharedMemory is set to true, the runtime will attempt to use shared memory
-// as long as the size requested fits the pre-allocated size.
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
-    int16_t UseSharedMemory) {
+INLINE void* data_sharing_push_stack_common(size_t PushSize) {
   if (isRuntimeUninitialized()) {
     ASSERT0(LT_FUSSY, isSPMDMode(),
             "Expected SPMD mode with uninitialized runtime.");
-    return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize);
+    return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize);
   }
 
+  // Only warp active master threads manage the stack.
+  bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
+
   // Add worst-case padding to DataSize so that future stack allocations are
   // correctly aligned.
   const size_t Alignment = 8;
-  if (DataSize % Alignment != 0) {
-    DataSize += (Alignment - DataSize % Alignment);
-  }
+  PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
 
   // Frame pointer must be visible to all workers in the same warp.
   unsigned WID = getWarpId();
-  void *&FrameP = DataSharingState.FramePtr[WID];
+  void *volatile &FrameP = DataSharingState.FramePtr[WID];
 
-  // Only warp active master threads manage the stack.
-  if (getThreadId() % WARPSIZE == 0) {
+  if (IsWarpMaster) {
     // SlotP will point to either the shared memory slot or an existing
     // global memory slot.
     __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
     void *&StackP = DataSharingState.StackPtr[WID];
 
-    // Compute the total memory footprint of the requested data.
-    // The master thread requires a stack only for itself. A worker
-    // thread (which at this point is a warp master) will require
-    // space for the variables of each thread in the warp,
-    // i.e. one DataSize chunk per warp lane.
-    // TODO: change WARPSIZE to the number of active threads in the warp.
-    size_t PushSize = IsMasterThread() ? DataSize : WARPSIZE * DataSize;
-
     // Check if we have room for the data in the current slot.
     const uintptr_t StartAddress = (uintptr_t)StackP;
     const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
@@ -453,12 +437,39 @@ EXTERN void* __kmpc_data_sharing_push_st
       // Reset stack pointer to the requested address.
       StackP = (void *)RequestedEndAddress;
     }
+  } else {
+    while (!FrameP);
   }
 
-  __threadfence_block();
+  return FrameP;
+}
+
+EXTERN void* __kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
+    int16_t UseSharedMemory) {
+  return data_sharing_push_stack_common(DataSize);
+}
+
+// Called at the time of the kernel initialization. This is used to initilize
+// the list of references to shared variables and to pre-allocate global storage
+// for holding the globalized variables.
+//
+// By default the globalized variables are stored in global memory. If the
+// UseSharedMemory is set to true, the runtime will attempt to use shared memory
+// as long as the size requested fits the pre-allocated size.
+EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
+    int16_t UseSharedMemory) {
+  // Compute the total memory footprint of the requested data.
+  // The master thread requires a stack only for itself. A worker
+  // thread (which at this point is a warp master) will require
+  // space for the variables of each thread in the warp,
+  // i.e. one DataSize chunk per warp lane.
+  // TODO: change WARPSIZE to the number of active threads in the warp.
+  size_t PushSize = (isRuntimeUninitialized() || IsMasterThread()) ?
+      DataSize : WARPSIZE * DataSize;
 
   // Compute the start address of the frame of each thread in the warp.
-  uintptr_t FrameStartAddress = (uintptr_t)FrameP;
+  uintptr_t FrameStartAddress =
+      (uintptr_t) data_sharing_push_stack_common(PushSize);
   FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
   return (void *)FrameStartAddress;
 }
@@ -475,6 +486,8 @@ EXTERN void __kmpc_data_sharing_pop_stac
     return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
   }
 
+  __threadfence_block();
+
   if (getThreadId() % WARPSIZE == 0) {
     unsigned WID = getWarpId();
 
@@ -501,8 +514,6 @@ EXTERN void __kmpc_data_sharing_pop_stac
       SlotP->Next = 0;
     }
   }
-
-  __threadfence_block();
 }
 
 // Begin a data sharing context. Maintain a list of references to shared

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h?rev=345867&r1=345866&r2=345867&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h Thu Nov  1 11:08:12 2018
@@ -478,6 +478,8 @@ EXTERN void __kmpc_kernel_end_convergent
 
 EXTERN void __kmpc_data_sharing_init_stack();
 EXTERN void __kmpc_data_sharing_init_stack_spmd();
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
+    int16_t UseSharedMemory);
 EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
 EXTERN void __kmpc_data_sharing_pop_stack(void *a);
 EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu?rev=345867&r1=345866&r2=345867&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu Thu Nov  1 11:08:12 2018
@@ -40,8 +40,6 @@ INLINE unsigned nsmid() {
 INLINE unsigned smid() {
   unsigned id;
   asm("mov.u32 %0, %%smid;" : "=r"(id));
-  ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
-          "Expected number of SMs is less than reported.");
   return id;
 }
 

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h?rev=345867&r1=345866&r2=345867&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h Thu Nov  1 11:08:12 2018
@@ -123,7 +123,7 @@ enum DATA_SHARING_SIZES {
 struct DataSharingStateTy {
   __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
   void *StackPtr[DS_Max_Warp_Number];
-  void *FramePtr[DS_Max_Warp_Number];
+  void * volatile FramePtr[DS_Max_Warp_Number];
   int32_t ActiveThreads[DS_Max_Warp_Number];
 };
 // Additional worker slot type which is initialized with the default worker slot

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h?rev=345867&r1=345866&r2=345867&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h Thu Nov  1 11:08:12 2018
@@ -188,7 +188,6 @@ INLINE void *SafeMalloc(size_t size, con
 {
   void *ptr = malloc(size);
   PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr));
-  ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg);
   return ptr;
 }
 




More information about the Openmp-commits mailing list