[Openmp-commits] [PATCH] D88185: [OpenMP] cmake option LIBOMPTARGET_NVPTX_MAX_SM for nvptx device RTL

Wed Sep 23 15:14:13 PDT 2020

ye-luo created this revision.
Herald added subscribers: guansong, yaxunl, mgorny.
ye-luo requested review of this revision.
Herald added a reviewer: jdoerfert.
Herald added a subscriber: sstefan1.

This option allows customizing MAX_SM for non-flagship GPUs, which reduces graphics memory usage.

In addition, the size has so far been hard-coded only up to __CUDA_ARCH__ 700, which is already a hassle for 800.
Introduce a MAX_SM value for 800 and guard against future architectures.


https://reviews.llvm.org/D88185

Files:
  openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
  openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h


Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
===================================================================

--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -47,16 +47,27 @@
 
 // Maximum number of omp state objects per SM allocated statically in global
 // memory.
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 600
 #define OMP_STATE_COUNT 32
+#else
+#define OMP_STATE_COUNT 16
+#endif
+
+#if !defined(MAX_SM)
+#if __CUDA_ARCH__ >= 900
+#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
+#elif __CUDA_ARCH__ >= 800
+// GA100 design has a maximum of 128 SMs but A100 product only has 108 SMs
+// GA102 design has a maximum of 84 SMs
+#define MAX_SM 108
+#elif __CUDA_ARCH__ >= 700
 #define MAX_SM 84
 #elif __CUDA_ARCH__ >= 600
-#define OMP_STATE_COUNT 32
 #define MAX_SM 56
 #else
-#define OMP_STATE_COUNT 16
 #define MAX_SM 16
 #endif
+#endif
 
 #define OMP_ACTIVE_PARALLEL_LEVEL 128
 
Index: openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -82,6 +82,11 @@
     set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm})
   endforeach()
 
+  # Override default MAX_SM in src/target_impl.h if requested
+  if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM)
+    set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}")
+  endif()
+
   # Activate RTL message dumps if requested by the user.
   set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
     "Activate NVPTX device RTL debug messages.")
@@ -96,7 +101,7 @@
   list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory}
                               -I${devicertl_nvptx_directory}/src)
   cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects}
-      OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG})
+      OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG} ${MAX_SM_DEFINITION})
 
   # Install device RTL under the lib destination folder.
   install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}")
@@ -159,7 +164,7 @@
         get_filename_component(outfile ${src} NAME)
 
         add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc
-          COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch}
+          COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION}
             -c ${infile} -o ${outfile}-sm_${sm}.bc
           DEPENDS ${infile}
           IMPLICIT_DEPENDS CXX ${infile}


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D88185.293869.patch
Type: text/x-patch
Size: 2657 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/openmp-commits/attachments/20200923/fba089f4/attachment.bin>