[libc-commits] [openmp] [libc] [libc][WIP] Initial support for exhaustive math tests on the GPU (PR #73720)

Wed Nov 29 13:31:22 PST 2023

https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/73720

>From 641bbcc431214cf8dc7d0189950d5e301a17ac66 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 28 Nov 2023 16:03:36 -0600
Subject: [PATCH 1/2] [Libomptarget] Output the DeviceRTL alongside the other
 libraries

Summary:
Currently, the `libomp.so` and `libomptarget.so` are emitted in the
`./lib` build directory generally. This logic is internal to the
`add_llvm_library` function we use to build `libomptarget`. The
DeviceRTl static library however is in the middle of the OpenMP runtime
build, which can vary depending on if this is a runtimes or projects
build. This patch changes this to install the DeviceRTL static library
alongside the other OpenMP libraries so they are easier to find.
---
 openmp/libomptarget/DeviceRTL/CMakeLists.txt |  6 ++++++
 openmp/libomptarget/test/lit.cfg             | 11 +++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
index df8e4a5126fd443..43458517f07f2f5 100644
--- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -304,4 +304,10 @@ add_library(omptarget.devicertl STATIC)
 set_target_properties(omptarget.devicertl PROPERTIES LINKER_LANGUAGE CXX)
 target_link_libraries(omptarget.devicertl PRIVATE omptarget.devicertl.all_objs)
 
+# Install this alongside the LLVM libraries is possible.
+if(NOT OPENMP_STANDALONE_BUILD)
+  set_target_properties(omptarget.devicertl PROPERTIES 
+                        ARCHIVE_OUTPUT_DIRECTORY ${LLVM_LIBRARY_OUTPUT_INTDIR})
+endif()
+
 install(TARGETS omptarget.devicertl ARCHIVE DESTINATION ${OPENMP_INSTALL_LIBDIR})
diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg
index 6dab31bd35a9f31..1388be578789664 100644
--- a/openmp/libomptarget/test/lit.cfg
+++ b/openmp/libomptarget/test/lit.cfg
@@ -153,11 +153,18 @@ def remove_suffix_if_present(name):
     else:
         return name
 
+def get_devicertl():
+    for dir in [config.llvm_library_dir, config.library_dir]:
+        path = os.path.join(dir, "libomptarget.devicertl.a")
+        if os.path.exists(path):
+            return path
+    return "";
+
 def add_libraries(source):
     if config.libomptarget_has_libc:
         return source + " " + config.llvm_library_dir + "/libcgpu.a " + \
-               config.library_dir + "/libomptarget.devicertl.a"
-    return source + " " + config.library_dir + "/libomptarget.devicertl.a"
+               get_devicertl()
+    return source + " " + get_devicertl()
 
 # substitutions
 # - for targets that exist in the system create the actual command.

>From 2872d35dac05852198bcd6fd056e2b6a9be45c2f Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 28 Nov 2023 15:51:39 -0600
Subject: [PATCH 2/2] [libc][WIP] Initial support for exhaustive math tests on
 the GPU

Summary:
We want to perform exhaustive math tests for the GPU implementations of
standard math functions to ensure that they are correct within some
bounds. The problem is that the current GPU test suite relies on using
RPC calls to perform host services. All the math implementations we can
compare against are implemented on the host CPU, so we cannot link
against MPFR or call the `libc` CPU math.

Due to the extreme specificity of this problem, I found it prudent to
make an entirely separate facility for exhaustive testing on the GPU.
This will use OpenMP to due host / device offloading so we can compute
on the GPU and copy it back to compare it against MPFR.

This works by manually inserting all the `libc` compiled dependncies
into the device portion of the compilation via `-Xoffload-linker`. The
downside is that this doesn't work currently on Nvidia because I cannot
use `-Wl,` due to default arguments, and it would need to be renamed
from `foo.o` to `foo.cubin`.

This is a WIP because the only test doesn't really do anything. Just
putting this up so the libc people can look at it.
---
 libc/cmake/modules/LLVMLibCCheckMPFR.cmake |   2 -
 libc/test/src/CMakeLists.txt               |   2 +-
 libc/test/src/math/CMakeLists.txt          |   5 +
 libc/test/src/math/gpu/CMakeLists.txt      | 139 +++++++++++++++++++++
 libc/test/src/math/gpu/exhaustive_test.h   | 137 ++++++++++++++++++++
 libc/test/src/math/gpu/truncf_test.cpp     |  33 +++++
 libc/utils/MPFRWrapper/CMakeLists.txt      |   2 +-
 7 files changed, 316 insertions(+), 4 deletions(-)
 create mode 100644 libc/test/src/math/gpu/CMakeLists.txt
 create mode 100644 libc/test/src/math/gpu/exhaustive_test.h
 create mode 100644 libc/test/src/math/gpu/truncf_test.cpp

diff --git a/libc/cmake/modules/LLVMLibCCheckMPFR.cmake b/libc/cmake/modules/LLVMLibCCheckMPFR.cmake
index 9e361f5fd811289..46f679f1330d3ba 100644
--- a/libc/cmake/modules/LLVMLibCCheckMPFR.cmake
+++ b/libc/cmake/modules/LLVMLibCCheckMPFR.cmake
@@ -2,8 +2,6 @@ set(LLVM_LIBC_MPFR_INSTALL_PATH "" CACHE PATH "Path to where MPFR is installed (
 
 if(LLVM_LIBC_MPFR_INSTALL_PATH)
   set(LIBC_TESTS_CAN_USE_MPFR TRUE)
-elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU)
-  set(LIBC_TESTS_CAN_USE_MPFR FALSE)
 else()
   try_compile(
     LIBC_TESTS_CAN_USE_MPFR
diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt
index c45b94f364397e3..10ad7ff595554f4 100644
--- a/libc/test/src/CMakeLists.txt
+++ b/libc/test/src/CMakeLists.txt
@@ -8,7 +8,7 @@ function(add_fp_unittest name)
   )
 
   if(MATH_UNITTEST_NEED_MPFR)
-    if(NOT LIBC_TESTS_CAN_USE_MPFR)
+    if(NOT LIBC_TESTS_CAN_USE_MPFR OR LIBC_TARGET_ARCHITECTURE_IS_GPU)
       message(VERBOSE "Math test ${name} will be skipped as MPFR library is not available.")
       return()
     endif()
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index fcb47449748dcac..87869538653f58f 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1722,3 +1722,8 @@ if(NOT LLVM_LIBC_FULL_BUILD)
   add_subdirectory(exhaustive)
   add_subdirectory(differential_testing)
 endif()
+
+# The GPU build uses special case exhaustive math tests.
+if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  add_subdirectory(gpu)
+endif()
diff --git a/libc/test/src/math/gpu/CMakeLists.txt b/libc/test/src/math/gpu/CMakeLists.txt
new file mode 100644
index 000000000000000..a065e39ec1dac11
--- /dev/null
+++ b/libc/test/src/math/gpu/CMakeLists.txt
@@ -0,0 +1,139 @@
+add_libc_exhaustive_testsuite(libc-math-gpu-exhaustive-tests)
+
+if(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+  message(WARNING "Exhaustive GPU tests are not currently supported on NVPTX")
+  return()
+endif()
+
+if((NOT "openmp" IN_LIST LLVM_ENABLE_RUNTIMES) AND
+   (NOT "openmp" IN_LIST LLVM_ENABLE_PROJECTS))
+  message(STATUS "The 'openmp' runtime must be enabled to run exhaustive "
+                 "GPU tests.")
+  return()
+endif()
+
+# Attempt to locate the libraries required for offloading.
+if(TARGET omptarget.devicertl AND TARGET omptarget AND TARGET omp)
+  set(LIBC_OPENMP_RUNTIME omptarget.devicertl omptarget omp)
+else()
+  find_library(omptarget.devicertl NAMES omptarget.devicertl
+               PATHS ${LLVM_LIBRARY_OUTPUT_INTDIR} ${LLVM_LIBRARY_DIR}
+               NO_DEFAULT_PATH
+  )
+  find_library(omptarget NAMES omptarget
+               PATHS ${LLVM_LIBRARY_OUTPUT_INTDIR} ${LLVM_LIBRARY_DIR}
+               NO_DEFAULT_PATH
+  )
+  find_library(omp NAMES omp
+               PATHS ${LLVM_LIBRARY_OUTPUT_INTDIR} ${LLVM_LIBRARY_DIR}
+               NO_DEFAULT_PATH
+  )
+  if(NOT omptarget.devicertl OR NOT omptarget OR NOT omp)
+    message(WARNING "Could not find the OpenMP runtime for exhaustive tests")
+    return()
+  endif()
+  set(LIBC_OPENMP_RUNTIME ${omptarget.devicertl} ${omptarget} ${omp})
+endif()
+
+# Ensure that the tests do not use any other libraries found on the system.
+if(${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux")
+  list(APPEND LIBC_OPENMP_RUNTIME "-Wl,-rpath,${LLVM_LIBRARY_DIR}"
+                                  "-Wl,-rpath,${LLVM_LIBRARY_OUTPUT_INTDIR}")
+endif()
+
+function(add_gpu_exhaustive_test name)
+  cmake_parse_arguments(
+    "MATH_GPU_EXHAUSTIVE_TEST"
+    "NEED_MPFR" # Optional arguments
+    "" # Single value arguments
+    "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES" # Multi-value arguments
+    ${ARGN}
+  )
+
+  if(NOT MATH_GPU_EXHAUSTIVE_TEST_SRCS)
+    message(FATAL_ERROR "'add_gpu_exhaustive_test' target requires a SRCS list "
+                        "of .cpp files.")
+  endif()
+  if(NOT MATH_GPU_EXHAUSTIVE_TEST_DEPENDS)
+    message(FATAL_ERROR "'add_gpu_exhaustive_test' target requires a DEPENDS "
+                        "list of 'add_entrypoint_object' targets.")
+  endif()
+
+  if(MATH_GPU_EXHAUSTIVE_TEST_NEED_MPFR)
+    if(NOT LIBC_TESTS_CAN_USE_MPFR)
+      message(VERBOSE "Math test ${name} will be skipped as MPFR library is "
+                      "not available.")
+      return()
+    endif()
+  endif()
+
+
+  if(MATH_GPU_EXHAUSTIVE_TEST_NEED_MPFR)
+    list(APPEND MATH_GPU_EXHAUSTIVE_TEST_LINK_LIBRARIES libcMPFRWrapper)
+  endif()
+
+  get_fq_target_name(${name} fq_target_name)
+  get_fq_deps_list(fq_deps_list ${MATH_GPU_EXHAUSTIVE_TEST_DEPENDS})
+  list(REMOVE_DUPLICATES fq_deps_list)
+
+  get_object_files_for_test(
+      link_object_files skipped_entrypoints_list ${fq_deps_list})
+  if(skipped_entrypoints_list)
+    message(STATUS "Skipping unittest ${fq_target_name} as it has missing deps:"
+                   " ${skipped_entrypoints_list}.")
+    return()
+  endif()
+
+  list(APPEND MATH_GPU_EXHAUSTIVE_TEST_LINK_LIBRARIES 
+              ${LIBC_OPENMP_RUNTIME} "--offload-link")
+  foreach(link_object_file ${link_object_files})
+    list(APPEND MATH_GPU_EXHAUSTIVE_TEST_LINK_LIBRARIES 
+                "-Xoffload-linker ${link_object_file}")
+  endforeach()
+
+  set(fq_exhaustive_target_name ${fq_target_name}.__exhaustive__)
+  add_executable(
+    ${fq_exhaustive_target_name}
+    EXCLUDE_FROM_ALL
+    ${MATH_GPU_EXHAUSTIVE_TEST_SRCS}
+    ${MATH_GPU_EXHAUSTIVE_TEST_HDRS}
+  )
+  set(LIBC_GPU_TEST_OPTIONS -fopenmp -nogpulib -nogpuinc -foffload-lto -fno-rtti
+                            -fopenmp-offload-mandatory -fpie -fno-exceptions
+                            --offload-arch=${LIBC_GPU_TARGET_ARCHITECTURE})
+
+  target_include_directories(${fq_exhaustive_target_name} PRIVATE 
+                             ${LIBC_SOURCE_DIR})
+
+  target_compile_options(${fq_exhaustive_target_name} PRIVATE 
+                         ${MATH_GPU_EXHAUSTIVE_TEST_COMPILE_OPTIONS} 
+                         ${LIBC_GPU_TEST_OPTIONS}
+  )
+  target_link_libraries(${fq_exhaustive_target_name} PRIVATE 
+                        ${MATH_GPU_EXHAUSTIVE_TEST_LINK_LIBRARIES} LibcTest.unit
+  )
+
+  add_dependencies(${fq_exhaustive_target_name} ${fq_deps_list})
+  set_target_properties(${fq_exhaustive_target_name}
+    PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+
+  add_custom_target(
+    ${fq_target_name}
+    COMMAND OMP_TARGET_OFFLOAD=MANDATORY 
+            $<TARGET_FILE:${fq_exhaustive_target_name}>
+    COMMAND_EXPAND_LISTS
+    COMMENT "Running exhaustive GPU test ${fq_target_name}"
+  )
+  add_dependencies(libc-math-gpu-exhaustive-tests ${fq_target_name})
+endfunction()
+
+add_gpu_exhaustive_test(
+  truncf_test
+  NEED_MPFR
+  SRCS
+    truncf_test.cpp
+  DEPENDS
+    libc.include.math
+    libc.src.math.truncf
+    libc.src.__support.FPUtil.fp_bits
+)
diff --git a/libc/test/src/math/gpu/exhaustive_test.h b/libc/test/src/math/gpu/exhaustive_test.h
new file mode 100644
index 000000000000000..73965eac3f070ff
--- /dev/null
+++ b/libc/test/src/math/gpu/exhaustive_test.h
@@ -0,0 +1,137 @@
+//===-- Exhaustive test template for math functions -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+#include <atomic>
+#include <functional>
+#include <iostream>
+#include <mutex>
+#include <sstream>
+#include <thread>
+#include <vector>
+
+// To test exhaustively for inputs in the range [start, stop) in parallel:
+// 1. Define a Checker class with:
+//    - FloatType: define floating point type to be used.
+//    - FPBits: fputil::FPBits<FloatType>.
+//    - UIntType: define bit type for the corresponding floating point type.
+//    - uint64_t check(start, stop, rounding_mode): a method to test in given
+//          range for a given rounding mode, which returns the number of
+//          failures.
+// 2. Use LlvmLibcExhaustiveMathTest<Checker> class
+// 3. Call: test_full_range(start, stop, nthreads, rounding)
+//       or test_full_range_all_roundings(start, stop).
+// * For single input single output math function, use the convenient template:
+//   LlvmLibcUnaryOpExhaustiveMathTest<FloatType, Op, Func>.
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+template <typename T> using UnaryOp = T(T);
+
+template <typename T, mpfr::Operation Op, UnaryOp<T> Func>
+struct UnaryOpChecker : public virtual LIBC_NAMESPACE::testing::Test {
+  using FloatType = T;
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<FloatType>;
+  using UIntType = typename FPBits::UIntType;
+
+  static constexpr UnaryOp<FloatType> *FUNC = Func;
+  static constexpr mpfr::Operation OP = Op;
+
+  // Check in a range, return the number of failures.
+  bool check(FloatType in, FloatType out, mpfr::RoundingMode rounding) {
+    mpfr::ForceRoundingMode r(rounding);
+    if (!r.success)
+      return true;
+
+    bool correct = TEST_MPFR_MATCH_ROUNDING(Op, in, out, 0.5, rounding);
+    return !correct;
+  }
+};
+
+// Checker class needs inherit from LIBC_NAMESPACE::testing::Test and provide
+// UIntType and check method.
+template <typename Checker>
+struct LlvmLibcExhaustiveMathTest
+    : public virtual LIBC_NAMESPACE::testing::Test,
+      public Checker {
+  using FloatType = typename Checker::FloatType;
+  using FPBits = typename Checker::FPBits;
+  using UIntType = typename Checker::UIntType;
+
+  static constexpr UIntType BLOCK_SIZE = (1 << 25);
+
+  // Break [start, stop) into chunks and compare results on the GPU vs the CPU.
+  void test_full_range(UIntType start, UIntType stop,
+                       mpfr::RoundingMode rounding) {
+
+    // TODO: We can run the GPU asynchronously to compute the next block.
+    // However, the main bottleneck is MPFR on the CPU.
+    uint64_t failed = 0;
+    for (UIntType chunk = start; chunk <= stop; chunk += BLOCK_SIZE) {
+      uint64_t percent = (static_cast<double>(chunk - start) /
+                          static_cast<double>(stop - start)) *
+                         100.0;
+      std::cout << percent << "% is in process     \r" << std::flush;
+      UIntType end = std::min(stop, chunk + BLOCK_SIZE);
+
+      std::vector<FloatType> data(BLOCK_SIZE, FloatType(0));
+
+      FloatType *ptr = data.data();
+      // Fill the buffer with the computed results from the GPU.
+#pragma omp target teams distribute parallel for map(from : ptr[0 : BLOCK_SIZE])
+      for (UIntType begin = chunk; begin < end; ++begin) {
+        UIntType idx = begin - chunk;
+
+        FPBits xbits(begin);
+        FloatType x = FloatType(xbits);
+
+        ptr[idx] = Checker::FUNC(x);
+      }
+
+      std::atomic<uint64_t> failed_in_range = 0;
+      // Check the GPU results against the MPFR library.
+#pragma omp parallel for default(firstprivate) shared(failed_in_range)
+      for (UIntType begin = chunk; begin < end; ++begin) {
+        UIntType idx = begin - chunk;
+
+        FPBits xbits(begin);
+        FloatType x = FloatType(xbits);
+
+        failed_in_range += Checker::check(x, data[idx], rounding);
+      }
+
+      if (failed_in_range > 0) {
+        std::stringstream msg;
+        msg << "Test failed for " << std::dec << failed_in_range
+            << " inputs in range: " << chunk << " to " << end << " [0x"
+            << std::hex << chunk << ", 0x" << end << "), [" << std::hexfloat
+            << static_cast<FloatType>(FPBits(chunk)) << ", "
+            << static_cast<FloatType>(FPBits(end)) << ")\n";
+        std::cerr << msg.str() << std::flush;
+
+        failed += failed_in_range.load();
+      }
+
+      // Check to make sure we don't overflow when updating the value.
+      if (chunk > std::numeric_limits<UIntType>::max() - BLOCK_SIZE)
+        chunk = std::numeric_limits<UIntType>::max();
+    }
+
+    std::cout << std::endl;
+    std::cout << "Test " << ((failed > 0) ? "FAILED" : "PASSED") << std::endl;
+    ASSERT_EQ(failed, uint64_t(0));
+  }
+};
+
+template <typename FloatType, mpfr::Operation Op, UnaryOp<FloatType> Func>
+using LlvmLibcUnaryOpExhaustiveMathTest =
+    LlvmLibcExhaustiveMathTest<UnaryOpChecker<FloatType, Op, Func>>;
diff --git a/libc/test/src/math/gpu/truncf_test.cpp b/libc/test/src/math/gpu/truncf_test.cpp
new file mode 100644
index 000000000000000..c708c544522f08d
--- /dev/null
+++ b/libc/test/src/math/gpu/truncf_test.cpp
@@ -0,0 +1,33 @@
+//===-- Exhaustive GPU test for truncf ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "exhaustive_test.h"
+#include "src/math/truncf.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+using LlvmLibcTruncfExhaustiveTest =
+    LlvmLibcUnaryOpExhaustiveMathTest<float, mpfr::Operation::Trunc,
+                                      LIBC_NAMESPACE::truncf>;
+
+// Range: [0, Inf];
+static constexpr uint32_t POS_START = 0x0000'0000U;
+static constexpr uint32_t POS_STOP = 0x7f80'0000U;
+
+TEST_F(LlvmLibcTruncfExhaustiveTest, PostiveRange) {
+  test_full_range(POS_START, POS_STOP, mpfr::RoundingMode::Nearest);
+}
+
+// Range: [-Inf, 0];
+static constexpr uint32_t NEG_START = 0xb000'0000U;
+static constexpr uint32_t NEG_STOP = 0xff80'0000U;
+
+TEST_F(LlvmLibcTruncfExhaustiveTest, NegativeRange) {
+  test_full_range(NEG_START, NEG_STOP, mpfr::RoundingMode::Nearest);
+}
diff --git a/libc/utils/MPFRWrapper/CMakeLists.txt b/libc/utils/MPFRWrapper/CMakeLists.txt
index 416307a20d7d181..7d3b46df369ca20 100644
--- a/libc/utils/MPFRWrapper/CMakeLists.txt
+++ b/libc/utils/MPFRWrapper/CMakeLists.txt
@@ -21,6 +21,6 @@ if(LIBC_TESTS_CAN_USE_MPFR)
     target_link_directories(libcMPFRWrapper PUBLIC ${LLVM_LIBC_MPFR_INSTALL_PATH}/lib)
   endif()
   target_link_libraries(libcMPFRWrapper LibcFPTestHelpers.unit LibcTest.unit mpfr gmp)
-elseif(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+else()
   message(WARNING "Math tests using MPFR will be skipped.")
 endif()