[Openmp-commits] [openmp] ffd159d - [OpenMP] cmake option LIBOMPTARGET_NVPTX_MAX_SM for nvptx device RTL
Shilei Tian via Openmp-commits
openmp-commits at lists.llvm.org
Thu Sep 24 09:40:06 PDT 2020
Author: Ye Luo
Date: 2020-09-24T12:39:59-04:00
New Revision: ffd159d8e919435561a8c9eac0dcdd83aacdcf6a
URL: https://github.com/llvm/llvm-project/commit/ffd159d8e919435561a8c9eac0dcdd83aacdcf6a
DIFF: https://github.com/llvm/llvm-project/commit/ffd159d8e919435561a8c9eac0dcdd83aacdcf6a.diff
LOG: [OpenMP] cmake option LIBOMPTARGET_NVPTX_MAX_SM for nvptx device RTL
It allows customizing MAX_SM for non-flagship GPUs, which reduces graphics memory usage.
In addition, the size has so far been hard-coded only up to __CUDA_ARCH__ 700, which is already a hassle for 800.
Introduce a MAX_SM value for 800 and guard against future architectures.
Reviewed By: JonChesterfield
Differential Revision: https://reviews.llvm.org/D88185
Added:
Modified:
openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
Removed:
################################################################################
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
index 84b52f55b73d..ca9e75953ff4 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -82,6 +82,11 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm})
endforeach()
+ # Override default MAX_SM in src/target_impl.h if requested
+ if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM)
+ set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}")
+ endif()
+
# Activate RTL message dumps if requested by the user.
set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
"Activate NVPTX device RTL debug messages.")
@@ -96,7 +101,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory}
-I${devicertl_nvptx_directory}/src)
cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects}
- OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG})
+ OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG} ${MAX_SM_DEFINITION})
# Install device RTL under the lib destination folder.
install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}")
@@ -159,7 +164,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
get_filename_component(outfile ${src} NAME)
add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc
- COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch}
+ COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION}
-c ${infile} -o ${outfile}-sm_${sm}.bc
DEPENDS ${infile}
IMPLICIT_DEPENDS CXX ${infile}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index d009e36a522f..e3a3d0f56c4e 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -47,16 +47,27 @@
// Maximum number of omp state objects per SM allocated statically in global
// memory.
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 600
#define OMP_STATE_COUNT 32
+#else
+#define OMP_STATE_COUNT 16
+#endif
+
+#if !defined(MAX_SM)
+#if __CUDA_ARCH__ >= 900
+#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
+#elif __CUDA_ARCH__ >= 800
+// GA100 design has a maximum of 128 SMs but A100 product only has 108 SMs
+// GA102 design has a maximum of 84 SMs
+#define MAX_SM 108
+#elif __CUDA_ARCH__ >= 700
#define MAX_SM 84
#elif __CUDA_ARCH__ >= 600
-#define OMP_STATE_COUNT 32
#define MAX_SM 56
#else
-#define OMP_STATE_COUNT 16
#define MAX_SM 16
#endif
+#endif
#define OMP_ACTIVE_PARALLEL_LEVEL 128
More information about the Openmp-commits
mailing list