[Openmp-commits] [openmp] r354471 - [OpenMP][libomptarget] New reduction scheme for team reductions

Wed Feb 20 06:55:55 PST 2019

Author: gbercea
Date: Wed Feb 20 06:55:55 2019
New Revision: 354471

URL: http://llvm.org/viewvc/llvm-project?rev=354471&view=rev
Log:
[OpenMP][libomptarget] New reduction scheme for team reductions

Summary:
This patch adds a more sophisticated team reduction scheme to the OpenMP libomptarget-nvptx runtime.

The scheme uses a fixed size global memory buffer whose length can be adjusted via compiler flag:
```
-fopenmp-cuda-teams-reduction-recs-num=1024
```
The global buffer is a structure of arrays (with default size of 1024 each and controlled by the above flag), one array for each reduction variable.

Values in the buffer are processed by the last team to finish executing the body of the target region.

In addition to adding support for the new flag, the compiler also emits special functions used for the reduction of the intermediate reduction values. These changes will be added in a separate compiler patch following this one.




Reviewers: ABataev, caomhin

Reviewed By: ABataev

Subscribers: guansong, jfb, jdoerfert, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D58409

Modified:
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h?rev=354471&r1=354470&r2=354471&view=diff
==============================================================================

--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h Wed Feb 20 06:55:55 2019
@@ -200,6 +200,7 @@ typedef void (*kmp_CopyToScratchpadFctPt
 typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad,
                                      int32_t index, int32_t width,
                                      int32_t reduce);
+typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data);
 
 // task defs
 typedef struct kmp_TaskDescr kmp_TaskDescr;
@@ -410,6 +411,12 @@ EXTERN int32_t __kmpc_nvptx_parallel_red
 EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait(
     int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
+EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
+    kmp_Ident *loc, int32_t global_tid, void *global_buffer,
+    int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
+    kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
+    kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
+    kmp_ListGlobalFctPtr glredFct);
 EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait(
     int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu?rev=354471&r1=354470&r2=354471&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu Wed Feb 20 06:55:55 2019
@@ -454,3 +454,144 @@ __kmpc_nvptx_teams_end_reduce_nowait_sim
   (void)atomicExch((uint32_t *)crit, 0);
 }
 
+INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
+  return checkGenericMode(loc) || IsTeamMaster(ThreadId);
+}
+
+INLINE static uint32_t roundToWarpsize(uint32_t s) {
+  if (s < WARPSIZE)
+    return 1;
+  return (s & ~(unsigned)(WARPSIZE - 1));
+}
+
+__device__ static volatile uint32_t IterCnt = 0;
+__device__ static volatile uint32_t Cnt = 0;
+EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
+    kmp_Ident *loc, int32_t global_tid, void *global_buffer,
+    int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
+    kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
+    kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
+    kmp_ListGlobalFctPtr glredFct) {
+
+  // Terminate all threads in non-SPMD mode except for the master thread.
+  if (checkGenericMode(loc) && GetThreadIdInBlock() != GetMasterThreadID())
+    return 0;
+
+  uint32_t ThreadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+
+  // In non-generic mode all workers participate in the teams reduction.
+  // In generic mode only the team master participates in the teams
+  // reduction because the workers are waiting for parallel work.
+  uint32_t NumThreads =
+      checkSPMDMode(loc)
+          ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true,
+                                  checkRuntimeUninitialized(loc))
+          : /*Master thread only*/ 1;
+  uint32_t TeamId = GetBlockIdInKernel();
+  uint32_t NumTeams = GetNumberOfBlocksInKernel();
+  __shared__ unsigned Bound;
+  __shared__ unsigned ChunkTeamCount;
+
+  // Block progress for teams greater than the current upper
+  // limit. We always only allow a number of teams less or equal
+  // to the number of slots in the buffer.
+  bool IsMaster = isMaster(loc, ThreadId);
+  while (IsMaster) {
+    // Atomic read
+    Bound = atomicAdd((uint32_t *)&IterCnt, 0);
+    if (TeamId < Bound + num_of_records)
+      break;
+  }
+
+  if (IsMaster) {
+    int ModBockId = TeamId % num_of_records;
+    if (TeamId < num_of_records)
+      lgcpyFct(global_buffer, ModBockId, reduce_data);
+    else
+      lgredFct(global_buffer, ModBockId, reduce_data);
+    __threadfence_system();
+
+    // Increment team counter.
+    // This counter is incremented by all teams in the current
+    // BUFFER_SIZE chunk.
+    ChunkTeamCount = atomicInc((uint32_t *)&Cnt, num_of_records - 1);
+  }
+  // Synchronize
+  if (checkSPMDMode(loc))
+    __kmpc_barrier(loc, global_tid);
+
+  // reduce_data is global or shared so before being reduced within the
+  // warp we need to bring it in local memory:
+  // local_reduce_data = reduce_data[i]
+  //
+  // Example for 3 reduction variables a, b, c (of potentially different
+  // types):
+  //
+  // buffer layout (struct of arrays):
+  // a, a, ..., a, b, b, ... b, c, c, ... c
+  // |__________|
+  //     num_of_records
+  //
+  // local_data_reduce layout (struct):
+  // a, b, c
+  //
+  // Each thread will have a local struct containing the values to be
+  // reduced:
+  //      1. do reduction within each warp.
+  //      2. do reduction across warps.
+  //      3. write the final result to the main reduction variable
+  //         by returning 1 in the thread holding the reduction result.
+
+  // Check if this is the very last team.
+  unsigned NumRecs = min(NumTeams, num_of_records);
+  if (ChunkTeamCount == NumTeams - Bound - 1) {
+    //
+    // Last team processing.
+    //
+    if (ThreadId >= NumRecs)
+      return 0;
+    NumThreads = roundToWarpsize(min(NumThreads, NumRecs));
+    if (ThreadId >= NumThreads)
+      return 0;
+
+    // Load from buffer and reduce.
+    glcpyFct(global_buffer, ThreadId, reduce_data);
+    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
+      glredFct(global_buffer, i, reduce_data);
+
+    // Reduce across warps to the warp master.
+    if (NumThreads > 1) {
+      gpu_regular_warp_reduce(reduce_data, shflFct);
+
+      // When we have more than [warpsize] number of threads
+      // a block reduction is performed here.
+      uint32_t ActiveThreads = min(NumRecs, NumThreads);
+      if (ActiveThreads > WARPSIZE) {
+        uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
+        // Gather all the reduced values from each warp
+        // to the first warp.
+        cpyFct(reduce_data, WarpsNeeded);
+
+        uint32_t WarpId = ThreadId / WARPSIZE;
+        if (WarpId == 0)
+          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                    ThreadId);
+      }
+    }
+
+    if (IsMaster) {
+      Cnt = 0;
+      IterCnt = 0;
+      return 1;
+    }
+    return 0;
+  }
+  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
+    // Allow SIZE number of teams to proceed writing their
+    // intermediate results to the global buffer.
+    atomicAdd((uint32_t *)&IterCnt, num_of_records);
+  }
+
+  return 0;
+}
+