[test-suite] r337570 - [CUDA, test-suite] Added test cases for the integet SIMD math functions.

Fri Jul 20 09:22:01 PDT 2018

Author: tra
Date: Fri Jul 20 09:22:01 2018
New Revision: 337570

URL: http://llvm.org/viewvc/llvm-project?rev=337570&view=rev
Log:
[CUDA, test-suite] Added test cases for the integet SIMD math functions.

CUDA-9.2 made these function compiler builtins, so clang no longer has access to
their implementation and we had to provide our own. These tests verify that the
result of these function match the ones provided by the reference
implementation.

Effectively the tests with CUDA-9.1 or older tests the reference
implementation. THe SIMD functions are provided by CUDA SDK and are the ground
truth for the test. The tests make sure that the code provided by the tests
matches it. The tests with CUDA-9.2 use clang-provided SIMD functions and the
implementation provided by the test is the ground truth.

Differential Revision: https://reviews.llvm.org/D49582

Added:
    test-suite/trunk/External/CUDA/simd.cu
Modified:
    test-suite/trunk/External/CUDA/CMakeLists.txt

Modified: test-suite/trunk/External/CUDA/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/External/CUDA/CMakeLists.txt?rev=337570&r1=337569&r2=337570&view=diff
==============================================================================

--- test-suite/trunk/External/CUDA/CMakeLists.txt (original)
+++ test-suite/trunk/External/CUDA/CMakeLists.txt Fri Jul 20 09:22:01 2018
@@ -87,6 +87,7 @@ macro(create_local_cuda_tests VariantSuf
   create_one_local_test(empty empty.cu)
   create_one_local_test(printf printf.cu)
   create_one_local_test(future future.cu)
+  create_one_local_test(simd simd.cu)
 endmacro()
 
 macro(thrust_make_test_name TestName TestSourcePath)

Added: test-suite/trunk/External/CUDA/simd.cu
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/External/CUDA/simd.cu?rev=337570&view=auto
==============================================================================
--- test-suite/trunk/External/CUDA/simd.cu (added)
+++ test-suite/trunk/External/CUDA/simd.cu Fri Jul 20 09:22:01 2018
@@ -0,0 +1,455 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <complex>  // Needed for std::min and max to work on device.
+#include <limits>
+
+int verbose = 0;
+
+#if __cplusplus >= 201103L
+#include <type_traits>
+
+// Convert a function into a functor with two arguments.  We rely on SFINAE to
+// instantiate a function template call() which will invoke FUNC() with one or
+// two arguments.
+#define F(FUNC, NELTS, NARGS)                                              \
+  typedef struct FUNC##_f {                                                \
+    static const int num_args = NARGS;                                     \
+    static const int num_elts = NELTS;                                     \
+    template <typename T, int NA = num_args>                               \
+    __device__ static typename std::enable_if<NA == 1, unsigned int>::type \
+    call(T a, T b) {                                                       \
+      return FUNC(a);                                                      \
+    }                                                                      \
+    template <typename T, int NA = num_args>                               \
+    __device__ static typename std::enable_if<NA == 2, unsigned int>::type \
+    call(T a, T b) {                                                       \
+      return FUNC(a, b);                                                   \
+    }                                                                      \
+  } FUNC##_f
+
+template <int N, typename T>
+__device__ unsigned int pack(T a[N]) {
+  unsigned int mask = (N == 2) ? 0xffff : 0xff;
+  unsigned int shift = (N == 2) ? 16 : 8;
+  unsigned int r = 0;
+  for (int i = 0; i < N; ++i) {
+    r |= ((unsigned int)a[i] & mask) << (shift * i);
+  }
+  return r;
+}
+template <int N, typename T>
+__device__ void unpack(unsigned int r, T (&a)[N]) {
+  unsigned int mask = (N == 2) ? 0xffff : 0xff;
+  unsigned int shift = (N == 2) ? 16 : 8;
+  for (int i = 0; i < N; ++i) {
+    a[i] = ((r >> (shift * i)) & mask);
+  }
+}
+
+enum op_t {
+  OP_ABS,
+  OP_ABSDIFF,
+  OP_ABSS,
+  OP_ADD,
+  OP_ADDS,
+  OP_AVG,
+  OP_CMPEQ,
+  OP_CMPGE,
+  OP_CMPGT,
+  OP_CMPLE,
+  OP_CMPLT,
+  OP_CMPNE,
+  OP_HADD,
+  OP_MAX,
+  OP_MIN,
+  OP_NEG,
+  OP_SAD,
+  OP_SETEQ,
+  OP_SETGE,
+  OP_SETGT,
+  OP_SETLE,
+  OP_SETLT,
+  OP_SETNE,
+  OP_SUB,
+  OP_SUBS,
+  OP_LAST
+};
+
+template <enum op_t OP, typename T>
+__device__ inline T elt_op(T a, T b = INT_MIN) {
+  switch (OP) {
+    case OP_ABS:
+      if (!std::numeric_limits<T>::is_signed) return a;
+      // This is wrong, but that's what __vabsN() returns.  We also need to
+      // handle that because abs(std::numeric_limits<T>::min()) would be an
+      // undefined behavior otherwise.
+      if (a == std::numeric_limits<T>::min())
+        return std::numeric_limits<T>::min();
+      return (a >= 0) ? a : -a;
+
+    case OP_ABSDIFF:
+      return std::abs(a - b);
+    case OP_ABSS: {
+      int result = std::abs(a);
+      if (result > std::numeric_limits<T>::max())
+        return std::numeric_limits<T>::max();
+      return result;
+    }
+    case OP_ADD:
+      return a + b;
+    case OP_ADDS: {
+      int result = (int)a + (int)b;
+      if (result > std::numeric_limits<T>::max())
+        return std::numeric_limits<T>::max();
+      if (std::numeric_limits<T>::is_signed &&
+          result < std::numeric_limits<T>::min())
+        return std::numeric_limits<T>::min();
+      return result;
+    }
+    case OP_AVG:
+      // This is *rounded* average. For simplicity let FP do the
+      // rounding. Considering that T is byte or short, we're guaranteed not to
+      // lose any bits.
+      return round(((float)a + (float)b) / 2.0f);
+    case OP_CMPEQ:
+      return a == b ? -1 : 0;
+    case OP_CMPGE:
+      return a >= b ? -1 : 0;
+    case OP_CMPGT:
+      return a > b ? -1 : 0;
+    case OP_CMPLE:
+      return a <= b ? -1 : 0;
+    case OP_CMPLT:
+      return a < b ? -1 : 0;
+    case OP_CMPNE:
+      return a != b ? -1 : 0;
+    case OP_HADD:
+      return (a + b) / 2;
+    case OP_MAX:
+      return std::max(a, b);
+    case OP_MIN:
+      return std::min(a, b);
+    case OP_NEG:
+      // This is wrong, but that's what __vnegN() returns.  We also need to
+      // handle that because abs(std::numeric_limits<T>::min()) would be an
+      // undefined behavior otherwise.
+      if (std::numeric_limits<T>::is_signed &&
+          a == std::numeric_limits<T>::min())
+        return std::numeric_limits<T>::min();
+      return -a;
+    case OP_SAD:
+      return std::abs(a - b);  // need to sum per-element results later.
+    case OP_SETEQ:
+      return a == b ? 1 : 0;
+    case OP_SETGE:
+      return a >= b ? 1 : 0;
+    case OP_SETGT:
+      return a > b ? 1 : 0;
+    case OP_SETLE:
+      return a <= b ? 1 : 0;
+    case OP_SETLT:
+      return a < b ? 1 : 0;
+    case OP_SETNE:
+      return a != b ? 1 : 0;
+    case OP_SUB:
+      return a - b;
+    case OP_SUBS: {
+      int result = (int)a - (int)b;
+      if (result > std::numeric_limits<T>::max())
+        return std::numeric_limits<T>::max();
+      if (result < std::numeric_limits<T>::min())
+        return std::numeric_limits<T>::min();
+      return result;
+    }
+    default:
+      assert(false && "unknown OP");
+  }
+  assert(false && "Unreachable.");
+  return 0;
+}
+
+template <op_t OP, typename T, int N>
+__device__ void simd_op(T (&r)[N], T a[N], T b[N]) {
+  if (OP == OP_SAD) {
+    // Sum up all elements in r[0] and clear the rest of r.
+    int result = 0;
+    for (int i = 0; i < N; ++i) {
+      result += elt_op<OP, T>(a[i], b[i]);
+      r[i] = 0;
+    }
+    r[0] = result;
+  } else {
+    // Just an element-wise op.
+    for (int i = 0; i < N; ++i) {
+      r[i] = elt_op<OP, T>(a[i], b[i]);
+    }
+  }
+}
+
+template <op_t OP, class SIMD_OP, typename T>
+__device__ void test_func(int verbose, int a, int b) {
+  constexpr int N = SIMD_OP::num_elts;
+  int dummy_args[] = {0,
+                      1,
+                      -1,
+                      std::numeric_limits<T>::max(),
+                      std::numeric_limits<T>::max() - 1,
+                      std::numeric_limits<T>::min(),
+                      std::numeric_limits<T>::min() + 1};
+  for (T x : dummy_args) {
+    for (int e = 0; e < N; ++e) {
+      T args_a[N];
+      T args_b[N];
+      for (int i = 0; i < N; ++i) {
+        args_a[i] = x;
+        args_b[i] = x;
+      }
+      args_a[e] = a;
+      args_b[e] = b;
+      unsigned int va = pack<N, T>(args_a);
+      unsigned int vb = pack<N, T>(args_b);
+      T expected_r[N];
+      simd_op<OP, T>(expected_r, args_a, args_b);
+      unsigned int evr = pack<N, T>(expected_r);
+      // This is weird and I don't understand what's going on.  With T = short,
+      // compiler ends up generating code which triggers the assert below
+      // if verbose == false, but triggers no assert if verbose == 1. It may be
+      // due to an undefined behavior somewhere, but the same code (with SIMD_OP
+      // below replaced with a pack(simd_op(a,b)) (so it could run on host)
+      // triggerend no ubsan reports.
+      asm volatile("" ::: "memory");
+      unsigned int vr = SIMD_OP::call(va, vb);
+      if (verbose && vr != evr) {
+        printf("e=%d a=%d b=%d va=%08x vb=%08x vr=%08x expected vr=%08x\n", e,
+               a, b, va, vb, vr, evr);
+      }
+      assert((vr == evr) && "Value mismatch");
+    }
+  }
+}
+
+template <op_t OP, class SIMD_OP, typename T>
+__global__ void test_kernel(int verbose) {
+  int a = blockIdx.x * blockDim.x + threadIdx.x;
+  int b = blockIdx.y * blockDim.y + threadIdx.y;
+  test_func<OP, SIMD_OP, T>(verbose, a, b);
+}
+
+template <op_t OP, class SIMD_OP, typename T>
+void test_op() {
+  int elements_a = SIMD_OP::num_elts == 2 ? 0x10000 : 0x100;
+  // Collapse second dimension if we test single-operand function.
+  int elements_b = SIMD_OP::num_args == 2 ? elements_a : 0;
+  dim3 grid_size(elements_a / 32, elements_b ? elements_b / 32 : 1, 1);
+  dim3 block_size(32, elements_b ? 32 : 1, 1);
+  printf("Testing %s...", __PRETTY_FUNCTION__);
+  test_kernel<OP, SIMD_OP, T><<<grid_size, block_size>>>(verbose);
+  cudaError_t err = cudaDeviceSynchronize();
+  if (err != cudaSuccess) {
+    printf("%s failed\n", __PRETTY_FUNCTION__);
+    printf("CUDA error %d\n", (int)err);
+    exit(EXIT_FAILURE);
+  } else {
+    printf("OK\n");
+  }
+}
+
+// Define functor types which we can then use to parametrize device-side tests.
+// F(function, num-elements, num-args)
+F(__vabs2, 2, 1);
+F(__vabs4, 4, 1);
+F(__vabsdiffs2, 2, 2);
+F(__vabsdiffs4, 4, 2);
+F(__vabsdiffu2, 2, 2);
+F(__vabsdiffu4, 4, 2);
+F(__vabsss2, 2, 1);
+F(__vabsss4, 4, 1);
+F(__vadd2, 2, 2);
+F(__vadd4, 4, 2);
+F(__vaddss2, 2, 2);
+F(__vaddus2, 2, 2);
+F(__vaddss4, 4, 2);
+F(__vaddus4, 4, 2);
+F(__vavgs2, 2, 2);
+F(__vavgu2, 2, 2);
+F(__vavgs4, 4, 2);
+F(__vavgu4, 4, 2);
+F(__vcmpeq2, 2, 2);
+F(__vcmpeq4, 4, 2);
+F(__vcmpges2, 2, 2);
+F(__vcmpges4, 4, 2);
+F(__vcmpgeu2, 2, 2);
+F(__vcmpgeu4, 4, 2);
+F(__vcmpgts2, 2, 2);
+F(__vcmpgts4, 4, 2);
+F(__vcmpgtu2, 2, 2);
+F(__vcmpgtu4, 4, 2);
+F(__vcmples2, 2, 2);
+F(__vcmples4, 4, 2);
+F(__vcmpleu2, 2, 2);
+F(__vcmpleu4, 4, 2);
+F(__vcmplts2, 2, 2);
+F(__vcmplts4, 4, 2);
+F(__vcmpltu2, 2, 2);
+F(__vcmpltu4, 4, 2);
+F(__vcmpne2, 2, 2);
+F(__vcmpne4, 4, 2);
+F(__vhaddu2, 2, 2);
+F(__vhaddu4, 4, 2);
+F(__vmaxs2, 2, 2);
+F(__vmaxs4, 4, 2);
+F(__vmaxu2, 2, 2);
+F(__vmaxu4, 4, 2);
+F(__vmins2, 2, 2);
+F(__vmins4, 4, 2);
+F(__vminu2, 2, 2);
+F(__vminu4, 4, 2);
+F(__vneg2, 2, 1);
+F(__vneg4, 4, 1);
+F(__vsads2, 2, 2);
+F(__vsadu2, 2, 2);
+F(__vsads4, 4, 2);
+F(__vsadu4, 4, 2);
+F(__vseteq2, 2, 2);
+F(__vseteq4, 4, 2);
+F(__vsetges2, 2, 2);
+F(__vsetges4, 4, 2);
+F(__vsetgeu2, 2, 2);
+F(__vsetgeu4, 4, 2);
+F(__vsetgts2, 2, 2);
+F(__vsetgts4, 4, 2);
+F(__vsetgtu2, 2, 2);
+F(__vsetgtu4, 4, 2);
+F(__vsetles2, 2, 2);
+F(__vsetles4, 4, 2);
+F(__vsetleu2, 2, 2);
+F(__vsetleu4, 4, 2);
+F(__vsetlts2, 2, 2);
+F(__vsetlts4, 4, 2);
+F(__vsetltu2, 2, 2);
+F(__vsetltu4, 4, 2);
+F(__vsetne2, 2, 2);
+F(__vsetne4, 4, 2);
+F(__vsub2, 2, 2);
+F(__vsub4, 4, 2);
+F(__vsubss2, 2, 2);
+F(__vsubus2, 2, 2);
+F(__vsubss4, 4, 2);
+F(__vsubus4, 4, 2);
+
+void tests() {
+  test_op<OP_NEG, __vneg2_f, short>();
+  test_op<OP_ABS, __vabs2_f, short>();
+  test_op<OP_ABS, __vabs4_f, signed char>();
+  test_op<OP_ABSDIFF, __vabsdiffs2_f, short>();
+  test_op<OP_ABSDIFF, __vabsdiffs4_f, signed char>();
+  test_op<OP_ABSDIFF, __vabsdiffu2_f, unsigned short>();
+  test_op<OP_ABSDIFF, __vabsdiffu4_f, unsigned char>();
+  test_op<OP_ABSS, __vabsss2_f, short>();
+  test_op<OP_ABSS, __vabsss4_f, signed char>();
+  test_op<OP_ADD, __vadd2_f, short>();
+  test_op<OP_ADD, __vadd4_f, signed char>();
+  test_op<OP_ADDS, __vaddss2_f, short>();
+  test_op<OP_ADDS, __vaddss4_f, signed char>();
+  test_op<OP_ADDS, __vaddus2_f, unsigned short>();
+  test_op<OP_ADDS, __vaddus4_f, unsigned char>();
+  test_op<OP_AVG, __vavgs2_f, short>();
+  test_op<OP_AVG, __vavgs4_f, signed char>();
+  test_op<OP_AVG, __vavgu2_f, unsigned short>();
+  test_op<OP_AVG, __vavgu4_f, unsigned char>();
+  test_op<OP_CMPEQ, __vcmpeq2_f, short>();
+  test_op<OP_CMPEQ, __vcmpeq4_f, signed char>();
+  test_op<OP_CMPGE, __vcmpges2_f, short>();
+  test_op<OP_CMPGE, __vcmpges4_f, signed char>();
+  test_op<OP_CMPGE, __vcmpgeu2_f, unsigned short>();
+  test_op<OP_CMPGE, __vcmpgeu4_f, unsigned char>();
+  test_op<OP_CMPGT, __vcmpgts2_f, short>();
+  test_op<OP_CMPGT, __vcmpgts4_f, signed char>();
+  test_op<OP_CMPGT, __vcmpgtu2_f, unsigned short>();
+  test_op<OP_CMPGT, __vcmpgtu4_f, unsigned char>();
+  test_op<OP_CMPLE, __vcmples2_f, short>();
+  test_op<OP_CMPLE, __vcmples4_f, signed char>();
+  test_op<OP_CMPLE, __vcmpleu2_f, unsigned short>();
+  test_op<OP_CMPLE, __vcmpleu4_f, unsigned char>();
+  test_op<OP_CMPLT, __vcmplts2_f, short>();
+  test_op<OP_CMPLT, __vcmplts4_f, signed char>();
+  test_op<OP_CMPLT, __vcmpltu2_f, unsigned short>();
+  test_op<OP_CMPLT, __vcmpltu4_f, unsigned char>();
+  test_op<OP_CMPNE, __vcmpne2_f, short>();
+  test_op<OP_CMPNE, __vcmpne4_f, signed char>();
+  test_op<OP_HADD, __vhaddu2_f, unsigned short>();
+  test_op<OP_HADD, __vhaddu4_f, unsigned char>();
+  test_op<OP_MAX, __vmaxs2_f, short>();  // ??? Fails?
+  test_op<OP_MAX, __vmaxs4_f, signed char>();
+  test_op<OP_MAX, __vmaxu2_f, unsigned short>();
+  test_op<OP_MAX, __vmaxu4_f, unsigned char>();
+  test_op<OP_MIN, __vmins2_f, short>();
+  test_op<OP_MIN, __vmins4_f, signed char>();
+  test_op<OP_MIN, __vminu2_f, unsigned short>();
+  test_op<OP_MIN, __vminu4_f, unsigned char>();
+  test_op<OP_NEG, __vneg2_f, short>();
+  test_op<OP_NEG, __vneg4_f, signed char>();
+  test_op<OP_SAD, __vsads2_f, short>();
+  test_op<OP_SAD, __vsads4_f, signed char>();
+  test_op<OP_SAD, __vsadu2_f, unsigned short>();
+  test_op<OP_SAD, __vsadu4_f, unsigned char>();
+  test_op<OP_SETEQ, __vseteq2_f, short>();
+  test_op<OP_SETEQ, __vseteq4_f, signed char>();
+  test_op<OP_SETGE, __vsetges2_f, short>();
+  test_op<OP_SETGE, __vsetges4_f, signed char>();
+  test_op<OP_SETGE, __vsetgeu2_f, unsigned short>();
+  test_op<OP_SETGE, __vsetgeu4_f, unsigned char>();
+  test_op<OP_SETGT, __vsetgts2_f, short>();
+  test_op<OP_SETGT, __vsetgts4_f, signed char>();
+  test_op<OP_SETGT, __vsetgtu2_f, unsigned short>();
+  test_op<OP_SETGT, __vsetgtu4_f, unsigned char>();
+  test_op<OP_SETLE, __vsetles2_f, short>();
+  test_op<OP_SETLE, __vsetles4_f, signed char>();
+  test_op<OP_SETLE, __vsetleu2_f, unsigned short>();
+  test_op<OP_SETLE, __vsetleu4_f, unsigned char>();
+  test_op<OP_SETLT, __vsetlts2_f, short>();
+  test_op<OP_SETLT, __vsetlts4_f, signed char>();
+  test_op<OP_SETLT, __vsetltu2_f, unsigned short>();
+  test_op<OP_SETLT, __vsetltu4_f, unsigned char>();
+  test_op<OP_SETNE, __vsetne2_f, short>();
+  test_op<OP_SETNE, __vsetne4_f, signed char>();
+  test_op<OP_SUB, __vsub2_f, short>();
+  test_op<OP_SUB, __vsub4_f, signed char>();
+  test_op<OP_SUBS, __vsubss2_f, short>();
+  test_op<OP_SUBS, __vsubss4_f, signed char>();
+  test_op<OP_SUBS, __vsubus2_f, unsigned short>();
+  test_op<OP_SUBS, __vsubus4_f, unsigned char>();
+}
+#else  // !C++11
+void tests() {
+  // These tests need C++11 to compile.
+}
+#endif
+
+int main(int argc, char** argv) {
+  int opt;
+  while ((opt = getopt(argc, argv, "v")) != -1) {
+    switch (opt) {
+      case 'v':
+        verbose = 1;
+        break;
+      default: /* '?' */
+        fprintf(stderr, "Usage: %s [-v]\n", argv[0]);
+        exit(EXIT_FAILURE);
+    }
+  }
+
+  tests();
+  printf("Success!\n");
+  return 0;
+}