[clang] [llvm] [OpenMP] Use generic IR for the OpenMP DeviceRTL (PR #119091)

Joseph Huber via cfe-commits cfe-commits at lists.llvm.org
Sat Dec 7 11:59:26 PST 2024


================
@@ -74,49 +72,53 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
   uint32_t NumThreads = omp_get_num_threads();
   if (NumThreads == 1)
     return 1;
-    /*
-     * This reduce function handles reduction within a team. It handles
-     * parallel regions in both L1 and L2 parallelism levels. It also
-     * supports Generic, SPMD, and NoOMP modes.
-     *
-     * 1. Reduce within a warp.
-     * 2. Warp master copies value to warp 0 via shared memory.
-     * 3. Warp 0 reduces to a single value.
-     * 4. The reduced value is available in the thread that returns 1.
-     */
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t WarpsNeeded =
-      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  uint32_t WarpId = mapping::getWarpIdInBlock();
-
-  // Volta execution model:
-  // For the Generic execution mode a parallel region either has 1 thread and
-  // beyond that, always a multiple of 32. For the SPMD execution mode we may
-  // have any number of threads.
-  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
 
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
   //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
+  // This reduce function handles reduction within a team. It handles
+  // parallel regions in both L1 and L2 parallelism levels. It also
+  // supports Generic, SPMD, and NoOMP modes.
+  //
+  // 1. Reduce within a warp.
+  // 2. Warp master copies value to warp 0 via shared memory.
+  // 3. Warp 0 reduces to a single value.
+  // 4. The reduced value is available in the thread that returns 1.
+  //
 
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
+#if __has_builtin(__nvvm_reflect)
+  if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
----------------
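A note on the four-step scheme quoted above: each step bottoms out in warp-level shuffles. Below is a minimal sketch of step 1 only, assuming CUDA's __shfl_down_sync intrinsic, a full 32-lane warp, and a plain integer sum; warp_reduce_sum is a hypothetical name, and the real DeviceRTL abstracts the combine operation behind shflFct rather than hard-coding '+'.

    __device__ int warp_reduce_sum(int Val) {
      // Tree reduction: halve the shuffle distance each step; after
      // log2(32) = 5 steps, lane 0 holds the warp-wide sum.
      for (int Offset = 16; Offset > 0; Offset /= 2)
        Val += __shfl_down_sync(0xffffffffu, Val, Offset);
      return Val;
    }
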
jhuber6 wrote:

I believe @AlexVlx also has interest in that space, so it would be best to consult with him as well.

https://github.com/llvm/llvm-project/pull/119091
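
For context on the change itself: the patch replaces the preprocessor guard '#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700' with a call to the __nvvm_reflect builtin. The NVPTX backend's NVVMReflect pass folds that call to a constant once the concrete sm_XX target is known and then deletes the dead branch, which is what lets a single generic-IR DeviceRTL serve all architectures. A sketch of the pattern, with hasVoltaExecModel as a hypothetical helper name not taken from the patch:

    static inline bool hasVoltaExecModel() {
    #if __has_builtin(__nvvm_reflect)
      // Folded to a constant by NVVMReflect once the target is known;
      // equivalent to the old '__CUDA_ARCH__ >= 700' compile-time check.
      return __nvvm_reflect("__CUDA_ARCH") >= 700; // sm_70 (Volta) or newer
    #else
      return false; // non-NVPTX targets never take the Volta path
    #endif
    }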

